aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-05 19:30:23 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-05 19:30:23 +0100
commit137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4 (patch)
treeef9c5ebe9e6da2022501e26e758047217f669b1a
parentd944a717c57e2cd4a292c091a221ca1da03eb03a (diff)
downloadfuzzycat-137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4.tar.gz
fuzzycat-137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4.zip
update docs
-rw-r--r--fuzzycat/cluster.py19
-rw-r--r--tests/test_cluster.py3
2 files changed, 20 insertions, 2 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index f622a44..29b1003 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -31,6 +31,9 @@ ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
+# Notes: untie from release_entity, as we are only using a few fields. Maybe
+# it's a JSON blob, with a pydantic spec and schema.
+
def release_key_title(release_entity, get_ident_title=get_ident_title):
id, title = get_ident_title(release_entity)
if not title:
@@ -53,7 +56,17 @@ def release_key_title_nysiis(release_entity):
def release_key_title_authors_ngram(release_entity):
"""
- Derive a key from title and authors.
+ Derive a key from title and authors. Authors in contribs list:
+
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "Meise Botanic Garden",
+ "role": "author"
+ }
+ ],
+
+
"""
# SS: compare ngram sets?
@@ -77,7 +90,9 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
def group_by(seq, key=None, value=None, comment=""):
"""
Iterate over lines in filename, group by key (a callable deriving the key
- from the line), then apply value callable to emit a minimal document.
+ from the line), then apply value callable on the same value to emit a
+ minimal document, containing the key and identifiers belonging to a
+ cluster.
"""
for k, g in itertools.groupby(seq, key=key):
doc = {
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 7439e15..05c2218 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -101,6 +101,8 @@ def test_release_key_title_nysiis():
assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
case.input)
+def test_release_key_title_authors_ngram():
+ pass
def test_sort_by_column():
with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf:
@@ -146,3 +148,4 @@ def test_group_by():
for case in cases:
assert case.result == list(group_by(case.seq, case.keyfunc, case.valuefunc))
+