From f556ea9ad72af80c17a456bcc8248e57f2afe0e5 Mon Sep 17 00:00:00 2001 From: Elena ``of Valhalla'' Grandi Date: Thu, 25 Mar 2021 09:51:23 +0100 Subject: Start getting the list and frequency of values for a field. --- lesana/collection.py | 36 ++++++++++++++++++++++++++++++++++++ lesana/types.py | 2 +- tests/data/complex/settings.yaml | 1 + tests/test_collection.py | 19 +++++++++++++++++++ 4 files changed, 57 insertions(+), 1 deletion(-) diff --git a/lesana/collection.py b/lesana/collection.py index cbef6ee..c148890 100644 --- a/lesana/collection.py +++ b/lesana/collection.py @@ -1,3 +1,4 @@ +import collections import io import logging import os @@ -402,6 +403,41 @@ class Collection(object): doc = cache.get_document(post.docid) yield self._doc_to_entry(doc) + def get_field_values(self, field, querystring='*'): + field = self.fields[field] + if field.field.get('sortable', False): + self.start_search(querystring) + + spy = xapian.ValueCountMatchSpy(field.value_index) + self._enquire.add_matchspy(spy) + + cache = self._get_cache() + self._enquire.get_mset(0, cache.get_doccount()) + + for v in spy.values(): + yield { + 'value': v.term, + 'frequency': v.termfreq, + } + + else: + logging.info( + "Trying to get the list of values for a non sortable field." + ) + logging.info( + "This is going to be pretty inefficient." + ) + values = ( + e.data[field.field['name']] + for e in self.get_all_documents() + ) + counter = collections.Counter(values) + for v in counter.most_common(): + yield { + 'value': v[0], + 'frequency': v[1], + } + def _match_to_entry(self, match): return self._doc_to_entry(match.document) diff --git a/lesana/types.py b/lesana/types.py index 83d12ba..6e5d21d 100644 --- a/lesana/types.py +++ b/lesana/types.py @@ -76,7 +76,7 @@ class LesanaType: doc.add_value(self.value_index, self._to_value(value)) else: logging.debug( - "Index values up to 8 are reserved for internal use" + "Index values up to 15 are reserved for internal use" ) diff --git a/tests/data/complex/settings.yaml b/tests/data/complex/settings.yaml index 7aaf47b..671a9b0 100644 --- a/tests/data/complex/settings.yaml +++ b/tests/data/complex/settings.yaml @@ -15,6 +15,7 @@ fields: - name: position type: string index: field + sortable: true - name: something type: yaml - name: tags diff --git a/tests/test_collection.py b/tests/test_collection.py index 2af4cfe..d05415f 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -285,6 +285,15 @@ class testSimpleCollection(unittest.TestCase): with open(fname, 'r') as fp: self.assertEqual(entry.yaml_data, fp.read()) + def test_list_values(self): + values = self.collection.get_field_values('position') + values = list(values) + self.assertEqual(len(values), 2) + self.assertEqual(values, [ + {'value': 'somewhere', 'frequency': 2}, + {'value': None, 'frequency': 1}, + ]) + class testComplexCollection(unittest.TestCase): def setUp(self): @@ -455,6 +464,16 @@ class testComplexCollection(unittest.TestCase): self.assertEqual(entry.data['updated'].year, now.year) self.assertEqual(entry.data['version'], 2) + def test_list_values(self): + values = self.collection.get_field_values('position') + values = list(values) + self.assertEqual(values, [ + {'value': b'Somewhere', 'frequency': 1}, + {'value': b'over there', 'frequency': 1}, + {'value': b'somewhere', 'frequency': 1}, + {'value': b'there', 'frequency': 1}, + ]) + class testCollectionWithErrors(unittest.TestCase): def setUp(self): -- cgit v1.2.3