From 8c7ba958c87f8b66cd786de8653042875437afa1 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Thu, 5 Nov 2020 23:48:40 +0100
Subject: define a result type

---
 fuzzycat/cluster.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 9a36c78..57b2d3e 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -49,6 +49,14 @@ class KeyDoc(BaseModel):
     contribs: Optional[List[Contrib]]
 
 
+class MapResult(BaseModel):
+    """
+    Result of deriving a key from a doc.
+    """
+    id: str
+    value: str
+
+
 get_ident_title = operator.itemgetter("ident", "title")
 ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
@@ -57,7 +65,7 @@ non_word_re = re.compile(r'[\W_]+', re.UNICODE)
 # it's a jsob blob, with a pydantic spec and schema.
 
 
-def release_key_title(doc: KeyDoc, get_ident_title=get_ident_title):
+def release_key_title(doc: KeyDoc) -> MapResult:
     id, title = get_ident_title(doc)
     if not title:
         raise ValueError('title missing')
@@ -65,19 +73,19 @@ def release_key_title(doc: KeyDoc, get_ident_title=get_ident_title):
     return (id, title)
 
 
-def release_key_title_normalized(doc: KeyDoc):
+def release_key_title_normalized(doc: KeyDoc) -> MapResult:
     id, title = release_key_title(doc)
     title = re.sub(r'[ ]{2,}', ' ', title)
     title = title.lower()
     return (id, non_word_re.sub('', title))
 
 
-def release_key_title_nysiis(doc: KeyDoc):
+def release_key_title_nysiis(doc: KeyDoc) -> MapResult:
     id, title = release_key_title(doc)
     return (id, fuzzy.nysiis(title))
 
 
-def release_key_title_authors_ngram(doc: KeyDoc):
+def release_key_title_authors_ngram(doc: KeyDoc) -> MapResult:
     """
     Derive a key from title and authors. Authors in contribs list:
 
-- 
cgit v1.2.3