From b368e0744d440cb2ba5fde8593325071b8389240 Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Thu, 17 Aug 2017 22:27:53 +0200 Subject: Start validating entries before indexing them. --- lesana/collection.py | 54 ++++++++++++++++++++-- .../items/b9a832309c984ada9f267471660c1313.yaml | 5 ++ tests/data/wrong/settings.yaml | 3 ++ tests/test_collection.py | 26 ++++++++++- 4 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 tests/data/wrong/items/b9a832309c984ada9f267471660c1313.yaml diff --git a/lesana/collection.py b/lesana/collection.py index 738f6ad..8f76ea7 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -64,6 +64,40 @@ class Entry(object): def idterm(self): return "Q"+self.uid + def validate(self): + print("validating", self) + errors = [] + valid = True + for field in self.collection.settings['fields']: + value = self.data.get(field['name'], None) + if not value: + # empty fields are always fine + continue + t = field['type'] + if t == 'integer': + try: + int(value) + except ValueError: + valid = False + errors.append({ + 'field': field['name'], + 'error': 'Invalid value for integer field: {}'.format( + value + ), + }) + elif t == 'float': + try: + float(value) + except ValueError: + valid = False + errors.append({ + 'field': field['name'], + 'error': 'Invalid value for float field: {}'.format( + value + ), + }) + return valid, errors + class Collection(object): """ @@ -104,7 +138,13 @@ class Collection(object): data = ruamel.yaml.safe_load(fp) else: data = ruamel.yaml.load(fp, ruamel.yaml.RoundTripLoader) - entry = self.entry_class(self, data, fname) + entry = self.entry_class(self, data, fname) + valid, errors = entry.validate() + if not valid: + logging.warning( + "Not indexing {fname}: invalid data".format(fname=fname) + ) + return False, errors doc = xapian.Document() self.indexer.set_document(doc) @@ -134,6 +174,7 @@ class Collection(object): doc.add_value(0, entry.fname.encode('utf-8')) cache.replace_document(entry.idterm, doc) + return True, [] @property def indexed_fields(self): @@ -176,14 +217,21 @@ class Collection(object): updated = 0 for fname in fnames: try: - self._index_file(fname, cache) + valid, errors = self._index_file(fname, cache) except IOError as e: logging.warning("Could not load file {}: {}".format( fname, str(e)) ) else: - updated += 1 + if valid: + updated += 1 + else: + logging.warning( + "File {fname} could not be indexed: {errors}".format( + fname=fname, + errors=errors) + ) return updated def save_entries(self, entries=[]): diff --git a/tests/data/wrong/items/b9a832309c984ada9f267471660c1313.yaml b/tests/data/wrong/items/b9a832309c984ada9f267471660c1313.yaml new file mode 100644 index 0000000..ec44b7c --- /dev/null +++ b/tests/data/wrong/items/b9a832309c984ada9f267471660c1313.yaml @@ -0,0 +1,5 @@ +name: 'Problematic entry' +description: | + . +position: 'somewhere' +number: 'four' diff --git a/tests/data/wrong/settings.yaml b/tests/data/wrong/settings.yaml index 9871421..ef9ab74 100644 --- a/tests/data/wrong/settings.yaml +++ b/tests/data/wrong/settings.yaml @@ -9,3 +9,6 @@ fields: - name: position type: string index: facet + - name: number + type: integer + help: "Enter an integer here" diff --git a/tests/test_collection.py b/tests/test_collection.py index 869828a..9de681a 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -54,7 +54,7 @@ class testCollection(unittest.TestCase): self.assertIsNotNone(self.collection.settings) self.assertIsNotNone(self.collection.stemmer) # Fields with no "index" entry are not indexed - self.assertEqual(len(self.collection.settings['fields']), 3) + self.assertEqual(len(self.collection.settings['fields']), 4) self.assertEqual(len(self.collection.indexed_fields), 1) def test_load_safe(self): @@ -240,6 +240,30 @@ class testComplexCollection(unittest.TestCase): self.collection.update_cache() +class testCollectionWithErrors(unittest.TestCase): + @classmethod + def setUpClass(self): + self.collection = lesana.Collection('tests/data/wrong') + + @classmethod + def tearDownClass(self): + shutil.rmtree(os.path.join(self.collection.basedir, '.lesana')) + + def test_init(self): + self.assertIsNotNone(self.collection.settings) + self.assertEqual( + self.collection.settings['name'], + "Lesana collection with certain errors" + ) + self.assertEqual(len(self.collection.settings['fields']), 4) + self.assertIsNotNone(self.collection.stemmer) + self.assertEqual(len(self.collection.indexed_fields), 1) + + def test_index(self): + loaded = self.collection.update_cache() + self.assertEqual(loaded, 0) + + class testCollectionCreation(unittest.TestCase): def test_init(self): tmpdir = tempfile.mkdtemp() -- cgit v1.2.3