about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-05 23:48:40 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-05 23:48:40 +0100
commit 8c7ba958c87f8b66cd786de8653042875437afa1 (patch)
tree b39cb4b8385fef360e1b611fe5fda173a6acc483
parent 6f8277d2b810ca60499dce4431b4d62d0535b78c (diff)
download fuzzycat-8c7ba958c87f8b66cd786de8653042875437afa1.tar.gz
fuzzycat-8c7ba958c87f8b66cd786de8653042875437afa1.zip
define a result type
-rw-r--r-- fuzzycat/cluster.py 16
1 files changed, 12 insertions, 4 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 9a36c78..57b2d3e 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -49,6 +49,14 @@ class KeyDoc(BaseModel):
contribs: Optional[List[Contrib]]
+class MapResult(BaseModel):
+ """
+ Result of deriving a key from a doc.
+ """
+ id: str
+ value: str
+
+
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
@@ -57,7 +65,7 @@ non_word_re = re.compile(r'[\W_]+', re.UNICODE)
# it's a JSON blob, with a pydantic spec and schema.
-def release_key_title(doc: KeyDoc, get_ident_title=get_ident_title):
+def release_key_title(doc: KeyDoc) -> MapResult:
id, title = get_ident_title(doc)
if not title:
raise ValueError('title missing')
@@ -65,19 +73,19 @@ def release_key_title(doc: KeyDoc, get_ident_title=get_ident_title):
return (id, title)
-def release_key_title_normalized(doc: KeyDoc):
+def release_key_title_normalized(doc: KeyDoc) -> MapResult:
id, title = release_key_title(doc)
title = re.sub(r'[ ]{2,}', ' ', title)
title = title.lower()
return (id, non_word_re.sub('', title))
-def release_key_title_nysiis(doc: KeyDoc):
+def release_key_title_nysiis(doc: KeyDoc) -> MapResult:
id, title = release_key_title(doc)
return (id, fuzzy.nysiis(title))
-def release_key_title_authors_ngram(doc: KeyDoc):
+def release_key_title_authors_ngram(doc: KeyDoc) -> MapResult:
"""
Derive a key from title and authors. Authors in contribs list: