From 75ee6ba2f3cf9e81faa826fc8e250e68dc5151c6 Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Thu, 17 Aug 2017 19:38:22 +0200 Subject: Enable derived collection to set their own entry class --- lesana/collection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lesana/collection.py b/lesana/collection.py index 2ac99aa..738f6ad 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -96,6 +96,7 @@ class Collection(object): # This selects whether to load all other yaml files with # safe_load or load + RoundTripLoader self.safe = False + self.entry_class = Entry def _index_file(self, fname, cache): with open(os.path.join(self.itemdir, fname)) as fp: @@ -103,7 +104,7 @@ class Collection(object): data = ruamel.yaml.safe_load(fp) else: data = ruamel.yaml.load(fp, ruamel.yaml.RoundTripLoader) - entry = Entry(self, data, fname) + entry = self.entry_class(self, data, fname) doc = xapian.Document() self.indexer.set_document(doc) @@ -282,7 +283,7 @@ class Collection(object): doc.get_data(), ruamel.yaml.RoundTripLoader ) - entry = Entry( + entry = self.entry_class( self, data=data, fname=fname, -- cgit v1.2.3 From d4d1b774cd487afaeaf3dee9f9c22e8261503c08 Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Thu, 17 Aug 2017 20:00:51 +0200 Subject: Added build-dependency to the README --- README.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.rst b/README.rst index f62f4a1..fe67833 100644 --- a/README.rst +++ b/README.rst @@ -45,6 +45,10 @@ Under debian (and derivatives), the packages to install are:: (some of those are only available on stretch+ because earlier versions lacked python3 support.) +lesana can be run in place from the git checkout / extracted tarball; to +use ``setup.py`` you will also need setuptools (e.g. from the +``python3-setuptools`` package under debian and derivatives). + License ------- -- cgit v1.2.3 From b368e0744d440cb2ba5fde8593325071b8389240 Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Thu, 17 Aug 2017 22:27:53 +0200 Subject: Start validating entries before indexing them. --- lesana/collection.py | 54 ++++++++++++++++++++-- .../items/b9a832309c984ada9f267471660c1313.yaml | 5 ++ tests/data/wrong/settings.yaml | 3 ++ tests/test_collection.py | 26 ++++++++++- 4 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 tests/data/wrong/items/b9a832309c984ada9f267471660c1313.yaml diff --git a/lesana/collection.py b/lesana/collection.py index 738f6ad..8f76ea7 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -64,6 +64,40 @@ class Entry(object): def idterm(self): return "Q"+self.uid + def validate(self): + print("validating", self) + errors = [] + valid = True + for field in self.collection.settings['fields']: + value = self.data.get(field['name'], None) + if not value: + # empty fields are always fine + continue + t = field['type'] + if t == 'integer': + try: + int(value) + except ValueError: + valid = False + errors.append({ + 'field': field['name'], + 'error': 'Invalid value for integer field: {}'.format( + value + ), + }) + elif t == 'float': + try: + float(value) + except ValueError: + valid = False + errors.append({ + 'field': field['name'], + 'error': 'Invalid value for float field: {}'.format( + value + ), + }) + return valid, errors + class Collection(object): """ @@ -104,7 +138,13 @@ class Collection(object): data = ruamel.yaml.safe_load(fp) else: data = ruamel.yaml.load(fp, ruamel.yaml.RoundTripLoader) - entry = self.entry_class(self, data, fname) + entry = self.entry_class(self, data, fname) + valid, errors = entry.validate() + if not valid: + logging.warning( + "Not indexing {fname}: invalid data".format(fname=fname) + ) + return False, errors doc = xapian.Document() self.indexer.set_document(doc) @@ -134,6 +174,7 @@ class Collection(object): doc.add_value(0, entry.fname.encode('utf-8')) cache.replace_document(entry.idterm, doc) + return True, [] @property def indexed_fields(self): @@ -176,14 +217,21 @@ class Collection(object): updated = 0 for fname in fnames: try: - self._index_file(fname, cache) + valid, errors = self._index_file(fname, cache) except IOError as e: logging.warning("Could not load file {}: {}".format( fname, str(e)) ) else: - updated += 1 + if valid: + updated += 1 + else: + logging.warning( + "File {fname} could not be indexed: {errors}".format( + fname=fname, + errors=errors) + ) return updated def save_entries(self, entries=[]): diff --git a/tests/data/wrong/items/b9a832309c984ada9f267471660c1313.yaml b/tests/data/wrong/items/b9a832309c984ada9f267471660c1313.yaml new file mode 100644 index 0000000..ec44b7c --- /dev/null +++ b/tests/data/wrong/items/b9a832309c984ada9f267471660c1313.yaml @@ -0,0 +1,5 @@ +name: 'Problematic entry' +description: | + . +position: 'somewhere' +number: 'four' diff --git a/tests/data/wrong/settings.yaml b/tests/data/wrong/settings.yaml index 9871421..ef9ab74 100644 --- a/tests/data/wrong/settings.yaml +++ b/tests/data/wrong/settings.yaml @@ -9,3 +9,6 @@ fields: - name: position type: string index: facet + - name: number + type: integer + help: "Enter an integer here" diff --git a/tests/test_collection.py b/tests/test_collection.py index 869828a..9de681a 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -54,7 +54,7 @@ class testCollection(unittest.TestCase): self.assertIsNotNone(self.collection.settings) self.assertIsNotNone(self.collection.stemmer) # Fields with no "index" entry are not indexed - self.assertEqual(len(self.collection.settings['fields']), 3) + self.assertEqual(len(self.collection.settings['fields']), 4) self.assertEqual(len(self.collection.indexed_fields), 1) def test_load_safe(self): @@ -240,6 +240,30 @@ class testComplexCollection(unittest.TestCase): self.collection.update_cache() +class testCollectionWithErrors(unittest.TestCase): + @classmethod + def setUpClass(self): + self.collection = lesana.Collection('tests/data/wrong') + + @classmethod + def tearDownClass(self): + shutil.rmtree(os.path.join(self.collection.basedir, '.lesana')) + + def test_init(self): + self.assertIsNotNone(self.collection.settings) + self.assertEqual( + self.collection.settings['name'], + "Lesana collection with certain errors" + ) + self.assertEqual(len(self.collection.settings['fields']), 4) + self.assertIsNotNone(self.collection.stemmer) + self.assertEqual(len(self.collection.indexed_fields), 1) + + def test_index(self): + loaded = self.collection.update_cache() + self.assertEqual(loaded, 0) + + class testCollectionCreation(unittest.TestCase): def test_init(self): tmpdir = tempfile.mkdtemp() -- cgit v1.2.3 From 8e15a0763e3476e333b1e71b70015cf3bd757160 Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Sat, 19 Aug 2017 17:12:24 +0200 Subject: Support indexing list fields --- lesana/collection.py | 41 ++++++++++++++-------- .../items/73097121f1874a6ea2f927db7dc4f11e.yaml | 10 ++++++ tests/data/complex/settings.yaml | 7 +++- tests/test_collection.py | 13 +++++-- 4 files changed, 53 insertions(+), 18 deletions(-) create mode 100644 tests/data/complex/items/73097121f1874a6ea2f927db7dc4f11e.yaml diff --git a/lesana/collection.py b/lesana/collection.py index 8f76ea7..cbdd3a7 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -151,24 +151,34 @@ class Collection(object): # Fields with prefix, for field search for field in self.indexed_fields: - try: - self.indexer.index_text( - entry.data.get(field['name']), - 1, - field['prefix']) - except ValueError as e: - logging.info("Not indexing empty? value {}: {}".format( - entry.data.get(field['name']), - str(e))) + if field['multi']: + values = entry.data.get(field['name']) + else: + values = [entry.data.get(field['name'])] + for v in values: + try: + self.indexer.index_text( + v, + 1, + field['prefix']) + except ValueError as e: + logging.info("Not indexing empty? value {}: {}".format( + entry.data.get(field['name']), + str(e))) # unprefixed fields, for full text search for field in self.indexed_fields: if field.get('free_search', False): - try: - self.indexer.index_text(entry.data.get(field['name'])) - self.indexer.increase_termpos() - except ValueError as e: - # probably already logged earlier - pass + if field['multi']: + values = entry.data.get(field['name']) + else: + values = [entry.data.get(field['name'])] + for v in values: + try: + self.indexer.index_text(v) + self.indexer.increase_termpos() + except ValueError as e: + # probably already logged earlier + pass doc.set_data(entry.yaml_data) doc.add_boolean_term(entry.idterm) doc.add_value(0, entry.fname.encode('utf-8')) @@ -186,6 +196,7 @@ class Collection(object): 'prefix': prefix, 'name': field['name'], 'free_search': field['index'] == 'free', + 'multi': field['type'] in ['list'] }) return fields diff --git a/tests/data/complex/items/73097121f1874a6ea2f927db7dc4f11e.yaml b/tests/data/complex/items/73097121f1874a6ea2f927db7dc4f11e.yaml new file mode 100644 index 0000000..1c7070c --- /dev/null +++ b/tests/data/complex/items/73097121f1874a6ea2f927db7dc4f11e.yaml @@ -0,0 +1,10 @@ +name: 'An item' +description: | + multi + line + description +position: 'over there' +something: '' +tags: + - this + - that diff --git a/tests/data/complex/settings.yaml b/tests/data/complex/settings.yaml index 57a1773..bd2179c 100644 --- a/tests/data/complex/settings.yaml +++ b/tests/data/complex/settings.yaml @@ -1,5 +1,6 @@ name: "Fully featured lesana collection" lang: 'english' +entry_label: '{{ uid}}: {{ name }} ({{ tags }})' fields: - name: name type: string @@ -11,6 +12,10 @@ fields: index: free - name: position type: string - index: facet + index: field - name: something type: yaml + - name: tags + type: list + list: string + index: field diff --git a/tests/test_collection.py b/tests/test_collection.py index 9de681a..875a35a 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -232,13 +232,22 @@ class testComplexCollection(unittest.TestCase): self.collection.settings['name'], "Fully featured lesana collection" ) - self.assertEqual(len(self.collection.settings['fields']), 4) + self.assertEqual(len(self.collection.settings['fields']), 5) self.assertIsNotNone(self.collection.stemmer) - self.assertEqual(len(self.collection.indexed_fields), 2) + self.assertEqual(len(self.collection.indexed_fields), 4) def test_index(self): self.collection.update_cache() + def test_indexing_list(self): + self.collection.update_cache(['73097121f1874a6ea2f927db7dc4f11e.yaml']) + self.collection.start_search('tags:this') + res = self.collection.get_search_results() + matches = list(res) + self.assertEqual(len(matches), 1) + for m in matches: + self.assertIsInstance(m, lesana.Entry) + class testCollectionWithErrors(unittest.TestCase): @classmethod -- cgit v1.2.3 From a2c8c2617e3bfc95b4d6c396c56afabeee59dadf Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Sat, 19 Aug 2017 18:17:45 +0200 Subject: Allow empty list fields --- lesana/collection.py | 2 ++ tests/data/complex/items/28b15099c84b41ab892133cd64876a32.yaml | 6 ++++++ 2 files changed, 8 insertions(+) create mode 100644 tests/data/complex/items/28b15099c84b41ab892133cd64876a32.yaml diff --git a/lesana/collection.py b/lesana/collection.py index cbdd3a7..50ed21e 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -155,6 +155,8 @@ class Collection(object): values = entry.data.get(field['name']) else: values = [entry.data.get(field['name'])] + if not values: + values = [] for v in values: try: self.indexer.index_text( diff --git a/tests/data/complex/items/28b15099c84b41ab892133cd64876a32.yaml b/tests/data/complex/items/28b15099c84b41ab892133cd64876a32.yaml new file mode 100644 index 0000000..58b84bb --- /dev/null +++ b/tests/data/complex/items/28b15099c84b41ab892133cd64876a32.yaml @@ -0,0 +1,6 @@ +name: 'A tagless item' +description: | + . +position: 'somewhere' +something: '' +tags: -- cgit v1.2.3 From bbb08c63c39e9e91c6044dcffe71dceba3cbef75 Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Sat, 19 Aug 2017 18:30:44 +0200 Subject: Removed debug print --- lesana/collection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lesana/collection.py b/lesana/collection.py index 50ed21e..7c654d8 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -65,7 +65,6 @@ class Entry(object): return "Q"+self.uid def validate(self): - print("validating", self) errors = [] valid = True for field in self.collection.settings['fields']: -- cgit v1.2.3 From 851914e5da250e6a38e83bccf182ab8af1db0d32 Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Sat, 19 Aug 2017 18:50:29 +0200 Subject: Enable wildcard searches --- lesana/collection.py | 11 ++++++++++- tests/test_collection.py | 9 +++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lesana/collection.py b/lesana/collection.py index 7c654d8..fbdb935 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -101,6 +101,12 @@ class Entry(object): class Collection(object): """ """ + PARSER_FLAGS = ( + xapian.QueryParser.FLAG_BOOLEAN | + xapian.QueryParser.FLAG_PHRASE | + xapian.QueryParser.FLAG_LOVEHATE | + xapian.QueryParser.FLAG_WILDCARD + ) def __init__(self, directory=None, itemdir='items'): self.basedir = directory or os.getcwd() @@ -296,11 +302,14 @@ class Collection(object): cache = self._get_cache() queryparser = xapian.QueryParser() queryparser.set_stemmer(self.stemmer) + queryparser.set_database(cache) for field in self.indexed_fields: queryparser.add_prefix(field['name'], field['prefix']) - query = queryparser.parse_query(querystring) + query = queryparser.parse_query( + querystring, + self.PARSER_FLAGS) self._enquire = xapian.Enquire(cache) self._enquire.set_query(query) diff --git a/tests/test_collection.py b/tests/test_collection.py index 875a35a..84f0a7a 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -80,6 +80,15 @@ class testCollection(unittest.TestCase): for m in matches: self.assertIsInstance(m, lesana.Entry) + def test_search_wildcard(self): + self.collection = lesana.Collection('tests/data/simple') + self.collection.start_search('Ite*') + res = self.collection.get_search_results() + matches = list(res) + self.assertEqual(len(matches), 2) + for m in matches: + self.assertIsInstance(m, lesana.Entry) + def test_search_non_init(self): self.collection = lesana.Collection('tests/data/simple') matches = list(self.collection.get_search_results()) -- cgit v1.2.3