update docs

author: Martin Czygan <martin.czygan@gmail.com> 2020-11-05 19:30:23 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-05 19:30:23 +0100
commit: 137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4 (patch)
tree: ef9c5ebe9e6da2022501e26e758047217f669b1a
parent: d944a717c57e2cd4a292c091a221ca1da03eb03a (diff)
download: fuzzycat-137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4.tar.gz
fuzzycat-137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4.zip
2 files changed, 20 insertions, 2 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index f622a44..29b1003 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -31,6 +31,9 @@ ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
 
 
+# Notes: untie from release_entity, as we are only using a few fields. Maybe
+# it's a JSON blob, with a pydantic spec and schema.
+
 def release_key_title(release_entity, get_ident_title=get_ident_title):
     id, title = get_ident_title(release_entity)
     if not title:
@@ -53,7 +56,17 @@ def release_key_title_nysiis(release_entity):
 
 def release_key_title_authors_ngram(release_entity):
     """
-    Derive a key from title and authors.
+    Derive a key from title and authors. Authors are in the contribs list:
+
+      "contribs": [
+        {
+          "index": 0,
+          "raw_name": "Meise Botanic Garden",
+          "role": "author"
+        }
+      ],
+
+
     """
     # SS: compare ngram sets?
 
@@ -77,7 +90,9 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
 def group_by(seq, key=None, value=None, comment=""):
     """
     Iterate over lines in filename, group by key (a callable deriving the key
-    from the line), then apply value callable to emit a minimal document.
+    from the line), then apply value callable on the same value to emit a
+    minimal document, containing the key and identifiers belonging to a
+    cluster.
     """
     for k, g in itertools.groupby(seq, key=key):
         doc = {
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 7439e15..05c2218 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -101,6 +101,8 @@ def test_release_key_title_nysiis():
         assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
             case.input)
 
+def test_release_key_title_authors_ngram():
+    pass
 
 def test_sort_by_column():
     with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf:
@@ -146,3 +148,4 @@ def test_group_by():
 
     for case in cases:
         assert case.result == list(group_by(case.seq, case.keyfunc, case.valuefunc))
+
author	Martin Czygan <martin.czygan@gmail.com>	2020-11-05 19:30:23 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-11-05 19:30:23 +0100
commit	137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4 (patch)
tree	ef9c5ebe9e6da2022501e26e758047217f669b1a
parent	d944a717c57e2cd4a292c091a221ca1da03eb03a (diff)
download	fuzzycat-137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4.tar.gz fuzzycat-137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4.zip