diff options
-rw-r--r-- | docs/source/user/settings.rst | 4 | ||||
-rw-r--r-- | lesana/collection.py | 6 | ||||
-rw-r--r-- | lesana/types.py | 49 | ||||
-rw-r--r-- | tests/data/complex/settings.yaml | 2 | ||||
-rw-r--r-- | tests/test_collection.py | 2 | ||||
-rw-r--r-- | tests/test_types.py | 93 |
6 files changed, 147 insertions, 9 deletions
diff --git a/docs/source/user/settings.rst b/docs/source/user/settings.rst index 68c4a0d..0d2eec9 100644 --- a/docs/source/user/settings.rst +++ b/docs/source/user/settings.rst @@ -36,6 +36,10 @@ Field definitions fields that should be available in the free text search and ``field`` for fields that should only be available by specifying the field name in the search. +``sortable``: + boolean; whether this field is sortable. Sortable fields enable + sorting the results and search by ranges, but having too many + sortable fields makes the search more resource intensive. ``help``: a description for the field; this is e.g. added to new entries as a comment. diff --git a/lesana/collection.py b/lesana/collection.py index 77023f7..7a4ff11 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -166,11 +166,15 @@ class Collection(object): for t in self._get_subsubclasses(types.LesanaType): type_loaders[t.name] = t fields = {} - for field in self.settings.get('fields', []): + for i, field in enumerate(self.settings.get('fields', [])): try: fields[field['name']] = type_loaders[field['type']]( field, type_loaders, + # value slot 0 is used to store the filename, and we + # reserve a few more slots just in case they are + # needed by lesana or some derivative + value_index=i + 16, ) except KeyError: # unknown fields are treated as if they were diff --git a/lesana/types.py b/lesana/types.py index b013a48..ce15b69 100644 --- a/lesana/types.py +++ b/lesana/types.py @@ -10,13 +10,16 @@ import logging import dateutil.parser +import xapian + class LesanaType: """ Base class for lesana field types. """ - def __init__(self, field, types): + def __init__(self, field, types, value_index=None): self.field = field + self.value_index = value_index def load(self, data): raise NotImplementedError @@ -24,6 +27,18 @@ class LesanaType: def empty(self): raise NotImplementedError + def _to_index_text(self, value): + """ + Prepare a value for indexing. 
+ """ + return str(value) + + def _to_value(self, value): + """ + Prepare a value for indexing in a value slot + """ + return str(value) + def index(self, doc, indexer, value): """ Index a value for this field type. @@ -35,16 +50,24 @@ class LesanaType: """ to_index = self.field.get('index', False) if not to_index: - return False + return if not value: logging.info( "Not indexing empty value {}".format(value) ) + return prefix = self.field.get('prefix', 'X' + self.field['name'].upper()) - indexer.index_text(str(value), 1, prefix) + indexer.index_text(self._to_index_text(value), 1, prefix) if to_index == 'free': - indexer.index_text(str(value)) + indexer.index_text(self._to_index_text(value)) indexer.increase_termpos() + if self.field.get('sortable', False): + if self.value_index and self.value_index >= 16: + doc.add_value(self.value_index, self._to_value(value)) + else: + logging.debug( + "Index values up to 8 are reserved for internal use" + ) class LesanaString(LesanaType): @@ -88,6 +111,18 @@ class LesanaInt(LesanaType): def empty(self): return 0 + def _to_index_text(self, value): + """ + Prepare a value for indexing. 
+ """ + return str(value) + + def _to_value(self, value): + """ + Prepare a value for indexing in a value slot + """ + return xapian.sortable_serialise(value) + class LesanaFloat(LesanaType): """ @@ -257,8 +292,8 @@ class LesanaList(LesanaType): name = 'list' - def __init__(self, field, types): - super().__init__(field, types) + def __init__(self, field, types, value_index=None): + super().__init__(field, types, value_index) try: self.sub_type = types[field['list']](field, types) except KeyError: @@ -285,7 +320,7 @@ class LesanaList(LesanaType): def index(self, doc, indexer, value): for v in value: - super().index(doc, indexer, v) + self.sub_type.index(doc, indexer, v) class LesanaValueError(ValueError): diff --git a/tests/data/complex/settings.yaml b/tests/data/complex/settings.yaml index 51f313f..e6781b5 100644 --- a/tests/data/complex/settings.yaml +++ b/tests/data/complex/settings.yaml @@ -31,3 +31,5 @@ fields: default: 'default value' - name: amount type: integer + index: field + sortable: true diff --git a/tests/test_collection.py b/tests/test_collection.py index bbc35ba..f3e06da 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -272,7 +272,7 @@ class testComplexCollection(unittest.TestCase): ) self.assertEqual(len(self.collection.settings['fields']), 9) self.assertIsNotNone(self.collection.stemmer) - self.assertEqual(len(self.collection.indexed_fields), 6) + self.assertEqual(len(self.collection.indexed_fields), 7) def test_index(self): self.collection.update_cache() diff --git a/tests/test_types.py b/tests/test_types.py index 90f6482..6f0c33e 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -2,6 +2,8 @@ import datetime import decimal import unittest +import xapian + from lesana import types @@ -308,5 +310,96 @@ class testTypes(unittest.TestCase): checker.load(d) +class testTypeIndexing(unittest.TestCase): + def setUp(self): + self.doc = xapian.Document() + self.indexer = xapian.TermGenerator() + + def _get_field_def(self, 
type_name): + return { + 'type': type_name, + 'name': 'test_field', + 'index': 'field', + 'sortable': True, + } + + def test_base(self): + checker = types.LesanaType(self._get_field_def('base'), {}, 16) + + checker.index(self.doc, self.indexer, "some string") + + def test_base_value_index_too_low(self): + checker = types.LesanaType(self._get_field_def('base'), {}, 1) + + checker.index(self.doc, self.indexer, "some string") + + # TODO: check that the string has not been indexed + + def test_string(self): + checker = types.LesanaString(self._get_field_def('string'), {}, 16) + + checker.index(self.doc, self.indexer, "some string") + + def test_text(self): + checker = types.LesanaText(self._get_field_def('text'), {}, 16) + + checker.index(self.doc, self.indexer, "some string") + + def test_int(self): + checker = types.LesanaInt(self._get_field_def('integer'), {}, 16) + + checker.index(self.doc, self.indexer, 1) + + def test_float(self): + checker = types.LesanaFloat(self._get_field_def('float'), {}, 16) + + checker.index(self.doc, self.indexer, 1.5) + + def test_decimal(self): + checker = types.LesanaDecimal(self._get_field_def('decimal'), {}, 16) + + checker.index(self.doc, self.indexer, decimal.Decimal('1.0')) + + def test_timestamp(self): + checker = types.LesanaTimestamp( + self._get_field_def('timestamp'), {}, 16 + ) + + checker.index(self.doc, self.indexer, 1600000000) + + def test_datetime(self): + checker = types.LesanaDatetime(self._get_field_def('datetime'), {}, 16) + + checker.index(self.doc, self.indexer, datetime.datetime.now()) + + def test_date(self): + checker = types.LesanaDate(self._get_field_def('date'), {}, 16) + + checker.index(self.doc, self.indexer, datetime.date.today()) + + def test_boolean(self): + checker = types.LesanaBoolean(self._get_field_def('boolean'), {}, 16) + + checker.index(self.doc, self.indexer, True) + + def test_url(self): + checker = types.LesanaURL(self._get_field_def('url'), {}, 16) + + checker.index(self.doc, self.indexer, 
"http://example.org") + + def test_yaml(self): + checker = types.LesanaYAML(self._get_field_def('yaml'), {}, 16) + + checker.index(self.doc, self.indexer, {'a': 1, 'b': 2}) + + def test_list(self): + field_def = self._get_field_def('yaml') + # we use one type that is easy to check for correct validation + field_def['list'] = 'int' + checker = types.LesanaList(field_def, {'int': types.LesanaInt}, 16) + + checker.index(self.doc, self.indexer, ["some", "thing"]) + + if __name__ == '__main__': unittest.main() |