From e6363dd992be0f2cf5e3395db5c6d652cee62b3f Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Thu, 22 Dec 2016 19:43:02 +0100 Subject: Add very basic search capabilities --- lesana/collection.py | 91 +++++++++++++++------- lesana/command.py | 42 ++++++++++ scripts/lesana | 3 +- .../085682ed-6792-499d-a3ab-9aebd683c011.yaml | 2 +- tests/test_collection.py | 21 +++-- 5 files changed, 125 insertions(+), 34 deletions(-) diff --git a/lesana/collection.py b/lesana/collection.py index 2e63ee7..3af1cd4 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -19,21 +19,6 @@ class Entry(object): self.uid = uuid.uuid4().hex self.fname = self.uid + '.yaml' - @property - def indexed_fields(self): - fields = [] - for field in self.collection.settings['fields']: - if field['index'] in ['free', 'field']: - prefix = field.get('prefix', 'X'+field['name'].upper()) - fields.append({ - 'value': self.data.get(field['name']), - 'prefix': prefix, - 'name': field['name'], - 'free_search': field['index'] == 'free', - }) - - return fields - def empty_data(self): data = {} for field in self.collection.settings['fields']: @@ -62,7 +47,6 @@ class Collection(object): def __init__(self, directory=None, itemdir='items'): self.basedir = directory or os.getcwd() self.itemdir = os.path.join(self.basedir, itemdir) - self.cache = None try: with open(os.path.join(self.basedir, 'settings.yaml')) as fp: self.settings = ruamel.yaml.load( @@ -87,38 +71,55 @@ class Collection(object): # safe_load or load + RoundTripLoader self.safe = True - def _index_file(self, fname): + def _index_file(self, fname, cache): with open(os.path.join(self.itemdir, fname)) as fp: if self.safe: data = ruamel.yaml.safe_load(fp) else: data = ruamel.yaml.load(fp, ruamel.yaml.RoundTripLoader) - entry = Entry(self, data) + entry = Entry(self, data, fname) doc = xapian.Document() self.indexer.set_document(doc) # Fields with prefix, for field search - for field in entry.indexed_fields: + for field in self.indexed_fields: try: - self.indexer.index_text(field['value'], 1, field['prefix']) + self.indexer.index_text( + entry.data.get(field['name']), + 1, + field['prefix']) except ValueError as e: logging.info("Not indexing empty? value {}: {}".format( - field['value'], + entry.data.get(field['name']), str(e))) # unprefixed fields, for full text search - for field in entry.indexed_fields: + for field in self.indexed_fields: if field.get('free_search', False): try: - self.indexer.index_text(field['value']) + self.indexer.index_text(entry.data.get(field['name'])) self.indexer.increase_termpos() except ValueError as e: # probably already logged earlier pass doc.set_data(entry.yaml_data) doc.add_boolean_term(entry.idterm) + doc.add_value(0, entry.fname) - self.cache.replace_document(entry.idterm, doc) + cache.replace_document(entry.idterm, doc) + + @property + def indexed_fields(self): + fields = [] + for field in self.settings['fields']: + if field['index'] in ['free', 'field']: + prefix = field.get('prefix', 'X'+field['name'].upper()) + fields.append({ + 'prefix': prefix, + 'name': field['name'], + 'free_search': field['index'] == 'free', + }) + return fields def update_cache(self, fnames=None): """ @@ -130,7 +131,7 @@ class Collection(object): Return the number of files that have been added to the cache. """ - self.cache = xapian.WritableDatabase( + cache = xapian.WritableDatabase( os.path.join(self.basedir, '.lesana/xapian'), xapian.DB_CREATE_OR_OPEN ) @@ -148,7 +149,7 @@ class Collection(object): updated = 0 for fname in fnames: try: - self._index_file(fname) + self._index_file(fname, cache) except IOError as e: logging.warning("Could not load file {}: {}".format( fname, @@ -168,3 +169,41 @@ class Collection(object): ) with open(complete_name, 'w') as fp: fp.write(e.yaml_data) + + def search(self, querystring, offset=0, pagesize=12): + try: + cache = xapian.Database( + os.path.join(self.basedir, '.lesana/xapian'), + ) + except xapian.DatabaseOpeningError: + logging.info("No database found, indexing entries.") + self.update_cache() + cache = xapian.Database( + os.path.join(self.basedir, '.lesana/xapian'), + ) + queryparser = xapian.QueryParser() + queryparser.set_stemmer(self.stemmer) + + for field in self.indexed_fields: + queryparser.add_prefix(field['name'], field['prefix']) + + query = queryparser.parse_query(querystring) + + enquire = xapian.Enquire(cache) + enquire.set_query(query) + + for match in enquire.get_mset(offset, pagesize): + fname = match.document.get_value(0) + if self.safe: + data = ruamel.yaml.safe_load(match.document.get_data()) + else: + data = ruamel.yaml.load( + match.document.get_data(), + ruamel.yaml.RoundTripLoader + ) + entry = Entry( + self, + data=data, + fname=fname, + ) + yield entry diff --git a/lesana/command.py b/lesana/command.py index 2f24fa0..7f7f116 100644 --- a/lesana/command.py +++ b/lesana/command.py @@ -54,3 +54,45 @@ class Index(gadona.Command): files = None indexed = collection.update_cache(fnames=files) print("Found and indexed {} entries".format(indexed)) + + +class Search(gadona.Command): + name = 'search' + arguments = [ + (['--collection', '-c'], dict( + help='The collection to work on (default .)' + )), + (['--template', '-t'], dict( + help='Am', + )), + (['--offset'], dict( + )), + (['--pagesize'], dict( + )), + (['query'], dict( + help='Xapian query to search in the collection', + nargs='+' + )), + ] + + def main(self): + # TODO: implement "searching" for everything + if self.settings.offset: + logging.warning( + "offset exposes an internal knob and MAY BE" + + "REMOVED from a future release of lesana" + ) + if self.settings.pagesize: + logging.warning( + "pagesize exposes an internal knob and MAY BE" + + "REMOVED from a future release of lesana" + ) + offset = self.settings.offset or 0 + pagesize = self.settings.pagesize or 12 + collection = Collection(self.settings.collection) + #TODO: pass the entries to a proper template + for entry in collection.search( + ' '.join(self.settings.query), + offset, + pagesize): + print(entry.fname) diff --git a/scripts/lesana b/scripts/lesana index 3886f52..c909888 100755 --- a/scripts/lesana +++ b/scripts/lesana @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import gadona -from lesana.command import New, Index +from lesana.command import New, Index, Search if __name__ == '__main__': app = gadona.App() @@ -9,5 +9,6 @@ if __name__ == '__main__': app.commands = [ New(), Index(), + Search(), ] app.main() diff --git a/tests/data/simple/items/085682ed-6792-499d-a3ab-9aebd683c011.yaml b/tests/data/simple/items/085682ed-6792-499d-a3ab-9aebd683c011.yaml index 16d6917..082128c 100644 --- a/tests/data/simple/items/085682ed-6792-499d-a3ab-9aebd683c011.yaml +++ b/tests/data/simple/items/085682ed-6792-499d-a3ab-9aebd683c011.yaml @@ -1,6 +1,6 @@ name: One Item description: | - This is a long block of test + This is a long block of text that spans multiple lines. position: somewhere uid: 085682ed6792499da3ab9aebd683c011 diff --git a/tests/test_collection.py b/tests/test_collection.py index 68c29f3..ccd535f 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -17,7 +17,6 @@ class testCollectionLoading(unittest.TestCase): self.assertEqual(self.collection.settings, {}) self.collection.update_cache() - self.assertIsNotNone(self.collection.cache) self.assertIsNotNone(self.collection.stemmer) def test_simple(self): @@ -28,9 +27,9 @@ class testCollectionLoading(unittest.TestCase): "Simple lesana collection" ) self.assertEqual(len(self.collection.settings['fields']), 4) + self.assertEqual(len(self.collection.indexed_fields), 2) self.collection.update_cache() - self.assertIsNotNone(self.collection.cache) self.assertIsNotNone(self.collection.stemmer) def test_wrong_language(self): @@ -42,7 +41,6 @@ class testCollectionLoading(unittest.TestCase): # The collection will default to english, but should still work. self.collection.update_cache() self.assertIsNotNone(self.collection.settings) - self.assertIsNotNone(self.collection.cache) self.assertIsNotNone(self.collection.stemmer) def test_unsafe(self): @@ -50,26 +48,37 @@ class testCollectionLoading(unittest.TestCase): self.collection.safe = False self.collection.update_cache() + def test_search(self): + self.collection = lesana.Collection('tests/data/simple') + res = self.collection.search('Item') + matches = list(res) + self.assertEqual(len(matches), 2) + for m in matches: + self.assertIsInstance(m, lesana.Entry) + + class testEntries(unittest.TestCase): def setUp(self): self.collection = lesana.Collection('tests/data/simple') self.basepath = 'tests/data/simple/items' + def tearDown(self): + shutil.rmtree(os.path.join(self.collection.basedir, '.lesana')) + + def test_simple(self): fname = '085682ed-6792-499d-a3ab-9aebd683c011.yaml' with open(os.path.join(self.basepath, fname)) as fp: data = ruamel.yaml.load(fp) entry = lesana.Entry(self.collection, data=data, fname=fname) self.assertEqual(entry.idterm, 'Q'+data['uid']) - self.assertEqual(len(entry.indexed_fields), 2) fname = '11189ee47ddf4796b718a483b379f976.yaml' uid = '11189ee47ddf4796b718a483b379f976' with open(os.path.join(self.basepath, fname)) as fp: data = ruamel.yaml.load(fp) entry = lesana.Entry(self.collection, data=data, fname=fname) self.assertEqual(entry.idterm, 'Q'+uid) - self.assertEqual(len(entry.indexed_fields), 2) def test_write_new(self): new_entry = lesana.Entry(self.collection) @@ -99,7 +108,7 @@ class testComplexCollection(unittest.TestCase): ) self.assertEqual(len(self.collection.settings['fields']), 3) self.assertIsNotNone(self.collection.stemmer) + self.assertEqual(len(self.collection.indexed_fields), 2) def test_index(self): self.collection.update_cache() - self.assertIsNotNone(self.collection.cache) -- cgit v1.2.3