# Helpers added to fuzzycat/utils.py by commit 16c0bbd ("extend tests",
# Martin Czygan, 2020-11-25): a token-based author name similarity score
# and the small building blocks it is composed of.
import itertools


def author_similarity_score(u, v):
    """
    Given two author strings, return a similarity score between 0 and 1.

    Both strings are lowercased, split on whitespace and cut into
    two-character pieces per token (see `token_n_grams`); the score is the
    Jaccard index of the two resulting sets of pieces. Two strings that
    yield no pieces at all (e.g. both empty) score 0.
    """
    return jaccard(set(token_n_grams(u)), set(token_n_grams(v)))


def jaccard(a, b):
    """
    Jaccard index of sets a and b: size of the intersection divided by the
    size of the union.

    Always returns a float; when both sets are empty the index is undefined
    and 0.0 is returned instead of dividing by zero.
    """
    # Build the union once, it is needed for both the guard and the ratio.
    union = a | b
    if not union:
        return 0.0
    return len(a & b) / len(union)


def token_n_grams(s, n=2):
    """
    Return n-grams, calculated per token.

    The string is tokenized with `tokenize_string` and every token is cut
    into consecutive, non-overlapping pieces of `n` characters (this is not
    a sliding window); the last piece of a token may be shorter than `n`.
    Pieces never span two tokens.

    With the default n=2, "Anne K Lane" yields ["an", "ne", "k", "la", "ne"].
    """
    pieces = itertools.chain.from_iterable(nwise(token, n=n) for token in tokenize_string(s))
    return ["".join(piece) for piece in pieces]


def tokenize_string(s):
    """
    Normalize and tokenize, should be broken up.

    Normalization is lowercasing only; tokens are the whitespace separated
    parts of the string. Returns a list of strings, empty if the string is
    empty or consists of whitespace only.
    """
    return s.lower().split()


def nwise(iterable, n=2):
    """
    Generalized :func:`pairwise`. Split an iterable after every `n` items.

    Yields tuples of up to `n` consecutive items; only the last tuple may be
    shorter. Nothing is yielded for an empty iterable or for n=0.
    """
    iterator = iter(iterable)
    while True:
        piece = tuple(itertools.islice(iterator, n))
        # An empty tuple means the iterator is exhausted (or n is 0).
        if not piece:
            return
        yield piece