aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/cluster.py
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-05 19:30:23 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-05 19:30:23 +0100
commit 137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4 (patch)
tree ef9c5ebe9e6da2022501e26e758047217f669b1a /fuzzycat/cluster.py
parent d944a717c57e2cd4a292c091a221ca1da03eb03a (diff)
download fuzzycat-137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4.tar.gz
fuzzycat-137e45cf20e2ce5914eeb815cf8e8d3e59e0acb4.zip
update docs
Diffstat (limited to 'fuzzycat/cluster.py')
-rw-r--r-- fuzzycat/cluster.py 19
1 files changed, 17 insertions, 2 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index f622a44..29b1003 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -31,6 +31,9 @@ ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
+# Notes: untie from release_entity, as we are only using a few fields. Maybe
+# it's a JSON blob, with a pydantic spec and schema.
+
def release_key_title(release_entity, get_ident_title=get_ident_title):
id, title = get_ident_title(release_entity)
if not title:
@@ -53,7 +56,17 @@ def release_key_title_nysiis(release_entity):
def release_key_title_authors_ngram(release_entity):
"""
- Derive a key from title and authors.
+ Derive a key from title and authors. Authors in contribs list:
+
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "Meise Botanic Garden",
+ "role": "author"
+ }
+ ],
+
+
"""
# SS: compare ngram sets?
@@ -77,7 +90,9 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
def group_by(seq, key=None, value=None, comment=""):
"""
Iterate over lines in filename, group by key (a callable deriving the key
- from the line), then apply value callable to emit a minimal document.
+ from the line), then apply value callable on the same value to emit a
+ minimal document, containing the key and identifiers belonging to a
+ cluster.
"""
for k, g in itertools.groupby(seq, key=key):
doc = {