aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/utils.py
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-25 01:29:38 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-25 01:29:38 +0100
commit 16c0bbd6339aadad8b994867ba05a44a0b326a25 (patch)
tree 2a1d65ea8f293eb8f95e293ca5304aa4aadbf7bd /fuzzycat/utils.py
parent 6bf0cb8a908122eed9cccd7f9fae35377a692c1d (diff)
downloadfuzzycat-16c0bbd6339aadad8b994867ba05a44a0b326a25.tar.gz
fuzzycat-16c0bbd6339aadad8b994867ba05a44a0b326a25.zip
extend tests
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r-- fuzzycat/utils.py 43
1 files changed, 43 insertions, 0 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index f269b11..5ded48c 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,3 +1,4 @@
+import itertools
import io
import string
@@ -24,3 +25,45 @@ def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
return parts[f]
return func
+
def author_similarity_score(u, v):
    """
    Score how similar two author strings are, as a number between 0 and 1.

    Each string is reduced to the set of its per-token character pieces
    (see `token_n_grams`); the score is the Jaccard index of the two sets.
    """
    pieces_u = set(token_n_grams(u))
    pieces_v = set(token_n_grams(v))
    return jaccard(pieces_u, pieces_v)
+
+
def jaccard(a, b):
    """
    Jaccard index of sets a and b: the size of their intersection divided
    by the size of their union.

    Both arguments must support the set operators ``|`` and ``&``. The
    result is always a float between 0.0 and 1.0. Two empty sets have an
    empty union; that case returns 0.0 instead of dividing by zero.
    """
    # Build the union once; it is needed for both the emptiness check and
    # the denominator.
    union = a | b
    if not union:
        return 0.0
    return len(a & b) / len(union)
+
+
def token_n_grams(s, n=2):
    """
    Tokenize s and cut every token into consecutive pieces of up to n
    characters, returned as one flat list of strings.

    Tokens come from `tokenize_string` and pieces from `nwise`, so the
    pieces of a token do not overlap and the last piece of a token may be
    shorter than n. For example, with the default n=2 the input
    "Jane Doe" gives ["ja", "ne", "do", "e"]. Order is preserved and
    duplicates are kept.
    """
    # from_iterable flattens lazily, without unpacking an intermediate list
    # of generators into the call arguments.
    pieces = itertools.chain.from_iterable(
        nwise(token, n=n) for token in tokenize_string(s)
    )
    return ["".join(piece) for piece in pieces]
+
+
def tokenize_string(s):
    """
    Normalize and tokenize: lowercase s, then split it on runs of
    whitespace and return the resulting list of tokens.

    An empty or whitespace-only string gives an empty list.

    NOTE: the original author marked this function as one that should be
    broken up (normalization and tokenization as separate steps).
    """
    # str.split() already returns a fresh list; no copy is needed.
    return s.lower().split()
+
+
def nwise(iterable, n=2):
    """
    Yield consecutive, non-overlapping tuples of up to n items taken from
    iterable, in order.

    The final tuple holds the leftover items and may be shorter than n.
    Nothing is yielded for an empty iterable.
    """
    source = iter(iterable)

    def take():
        # Pull the next (at most n) items off the shared iterator.
        return tuple(itertools.islice(source, n))

    # Keep calling take() until it returns the empty tuple, i.e. until the
    # underlying iterator is exhausted.
    yield from iter(take, ())
+