diff options
-rw-r--r-- | lesana/collection.py | 79 | ||||
-rw-r--r-- | tests/test_collection.py | 14 |
2 files changed, 65 insertions, 28 deletions
diff --git a/lesana/collection.py b/lesana/collection.py index 9b44b54..64b4246 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -10,18 +10,16 @@ class Entry(object): self.collection = collection self.data = data - def index(self): - # TODO: maybe add the ability to add a language specific to the - # entry, and change the stemmer - doc = xapian.Document() - indexer = xapian.TermGenerator() - indexer.set_stemmer(self.collection.stemmer) - # FIXME: this is obviously wrong, actually index the right - # things - doc.set_data(str(self.data)) - indexer.set_document(doc) - indexer.index_text(str(self.data)) - self.collection.cache.add_document(doc) + def indexed_fields(self): + return [] + + @property + def yaml_data(self): + return "" + + @property + def idterm(self): + return "Q" @@ -37,10 +35,6 @@ class Collection(object): except FileNotFoundError: self.schema = ruamel.yaml.load("{}") os.makedirs(os.path.join(self.basedir, '.lesana'), exist_ok=True) - self.cache = xapian.WritableDatabase( - os.path.join(self.basedir, '.lesana/xapian'), - xapian.DB_CREATE_OR_OPEN - ) if 'lang' in self.schema: try: self.stemmer = xapian.Stem(self.schema['lang']) @@ -56,12 +50,46 @@ class Collection(object): # safe_load or load + RoundTripLoader self.safe = True + def _index_file(self, fname): + with open(os.path.join(self.basedir, 'items', fname)) as fp: + if self.safe: + data = ruamel.yaml.safe_load(fp) + else: + data = ruamel.yaml.load(fp, ruamel.yaml.RoundTripLoader) + entry = Entry(self, data) + + doc = xapian.Document() + self.indexer.set_document(doc) + # FIXME: this is obviously wrong, actually index the right + # things + + # Fields with prefix, for field search + for field in entry.indexed_fields(): + self.indexer.index_text(field['value'], 1, field['prefix']) + # unprefixed fields, for full text search + for field in entry.indexed_fields(): + if field.get('free_search', False): + self.indexer.index_text(field['value']) + self.indexer.increase_termpos() + doc.set_data(entry.yaml_data) + doc.add_boolean_term(entry.idterm) + + self.cache.replace_document(entry.idterm, doc) + def update_cache(self, files=None): """ Update the xapian db with the data in files. If no files have been passed, add everything. + + Return the number of files that have been added to the cache. """ + self.cache = xapian.WritableDatabase( + os.path.join(self.basedir, '.lesana/xapian'), + xapian.DB_CREATE_OR_OPEN + ) + self.indexer = xapian.TermGenerator() + self.indexer.set_stemmer(self.stemmer) if not files: try: files = os.listdir(os.path.join(self.basedir, 'items')) @@ -69,12 +97,15 @@ class Collection(object): logging.warning( "No such file or directory: %s, not updating cache", os.path.join(self.basedir, 'items')) - return False + return 0 + updated = 0 for fname in files: - with open(os.path.join(self.basedir, 'items', fname)) as fp: - if self.safe: - data = ruamel.yaml.safe_load(fp) - else: - data = ruamel.yaml.load(fp, ruamel.yaml.RoundTripLoader) - entry = Entry(self, data) - entry.index() + try: + self._index_file(fname) + except IOError as e: + logging.warning("Could not load file {}: {}".format( + fname, + str(e))) + else: + updated += 1 + return updated diff --git a/tests/test_collection.py b/tests/test_collection.py index d0a9864..b70c85f 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -13,20 +13,20 @@ class testCollectionLoading(unittest.TestCase): def test_empty(self): self.collection = lesana.Collection('tests/data/empty') self.assertEqual(self.collection.schema, {}) - self.assertIsNotNone(self.collection.cache) - self.assertIsNotNone(self.collection.stemmer) self.collection.update_cache() + self.assertIsNotNone(self.collection.cache) + self.assertIsNotNone(self.collection.stemmer) def test_simple(self): self.collection = lesana.Collection('tests/data/simple') self.assertIsNotNone(self.collection.schema) self.assertEqual(self.collection.schema['name'], "Simple lesana collection") self.assertEqual(len(self.collection.schema['fields']), 3) - self.assertIsNotNone(self.collection.cache) - self.assertIsNotNone(self.collection.stemmer) self.collection.update_cache() + self.assertIsNotNone(self.collection.cache) + self.assertIsNotNone(self.collection.stemmer) def test_wrong_language(self): # This loads a collection with an invalid value in lang @@ -35,6 +35,12 @@ class testCollectionLoading(unittest.TestCase): self.assertEqual(len(cm.output), 1) self.assertIn("Invalid language", cm.output[0]) # The collection will default to english, but should still work. + self.collection.update_cache() self.assertIsNotNone(self.collection.schema) self.assertIsNotNone(self.collection.cache) self.assertIsNotNone(self.collection.stemmer) + + def test_unsafe(self): + self.collection = lesana.Collection('tests/data/simple') + self.collection.safe = False + self.collection.update_cache() |