diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-05 23:48:40 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-05 23:48:40 +0100 |
commit | 8c7ba958c87f8b66cd786de8653042875437afa1 (patch) | |
tree | b39cb4b8385fef360e1b611fe5fda173a6acc483 /fuzzycat | |
parent | 6f8277d2b810ca60499dce4431b4d62d0535b78c (diff) | |
download | fuzzycat-8c7ba958c87f8b66cd786de8653042875437afa1.tar.gz fuzzycat-8c7ba958c87f8b66cd786de8653042875437afa1.zip |
define a result type
Diffstat (limited to 'fuzzycat')
-rw-r--r-- | fuzzycat/cluster.py | 16 |
1 file changed, 12 insertions, 4 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 9a36c78..57b2d3e 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -49,6 +49,14 @@ class KeyDoc(BaseModel): contribs: Optional[List[Contrib]] +class MapResult(BaseModel): + """ + Result of deriving a key from a doc. + """ + id: str + value: str + + get_ident_title = operator.itemgetter("ident", "title") ws_replacer = str.maketrans({"\t": " ", "\n": " "}) non_word_re = re.compile(r'[\W_]+', re.UNICODE) @@ -57,7 +65,7 @@ non_word_re = re.compile(r'[\W_]+', re.UNICODE) # it's a jsob blob, with a pydantic spec and schema. -def release_key_title(doc: KeyDoc, get_ident_title=get_ident_title): +def release_key_title(doc: KeyDoc) -> MapResult: id, title = get_ident_title(doc) if not title: raise ValueError('title missing') @@ -65,19 +73,19 @@ def release_key_title(doc: KeyDoc, get_ident_title=get_ident_title): return (id, title) -def release_key_title_normalized(doc: KeyDoc): +def release_key_title_normalized(doc: KeyDoc) -> MapResult: id, title = release_key_title(doc) title = re.sub(r'[ ]{2,}', ' ', title) title = title.lower() return (id, non_word_re.sub('', title)) -def release_key_title_nysiis(doc: KeyDoc): +def release_key_title_nysiis(doc: KeyDoc) -> MapResult: id, title = release_key_title(doc) return (id, fuzzy.nysiis(title)) -def release_key_title_authors_ngram(doc: KeyDoc): +def release_key_title_authors_ngram(doc: KeyDoc) -> MapResult: """ Derive a key from title and authors. Authors in contribs list: |