-rw-r--r--  fuzzycat/utils.py    | 43
-rw-r--r--  fuzzycat/verify.py   | 46
-rw-r--r--  tests/test_utils.py  |  7
3 files changed, 52 insertions(+), 44 deletions(-)
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index f269b11..5ded48c 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,3 +1,4 @@
+import itertools
 import io
 import string
 
@@ -24,3 +25,45 @@ def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
         return parts[f]
 
     return func
+
+def author_similarity_score(u, v):
+    """
+    Given two author strings, return a similarity score between 0 and 1.
+    """
+    return jaccard(set(token_n_grams(u)), set(token_n_grams(v)))
+
+
+def jaccard(a, b):
+    """
+    Jaccard of sets a and b.
+    """
+    if len(a | b) == 0:
+        return 0
+    return len(a & b) / len(a | b)
+
+
+def token_n_grams(s):
+    """
+    Return n-grams, calculated per token.
+    """
+    return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])]
+
+
+def tokenize_string(s):
+    """
+    Normalize and tokenize, should be broken up.
+    """
+    return [token for token in s.lower().split()]
+
+
+def nwise(iterable, n=2):
+    """
+    Generalized :func:`pairwise`. Split an iterable after every
+    `n` items.
+    """
+    i = iter(iterable)
+    piece = tuple(itertools.islice(i, n))
+    while piece:
+        yield piece
+        piece = tuple(itertools.islice(i, n))
+
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 7a7f01f..9a0fbd5 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -66,8 +66,10 @@ import re
 import sys
 from enum import Enum
 
-from fuzzycat.cluster import slugify_string
+from fuzzycat.utils import author_similarity_score, slugify_string
 
+# The result of clustering are documents that have a key k and a list of values
+# (of the cluster) v.
 get_key_values = operator.itemgetter("k", "v")
 
 # More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
@@ -364,48 +366,6 @@ def compare(a, b):
     return (Status.AMBIGUOUS, OK.DUMMY)
 
 
-def author_similarity_score(u, v):
-    """
-    Given two author strings, return a similarity score between 0 and 1.
-    """
-    return jaccard(set(token_n_grams(u)), set(token_n_grams(v)))
-
-
-def jaccard(a, b):
-    """
-    Jaccard of sets a and b.
-    """
-    if len(a | b) == 0:
-        return 0
-    return len(a & b) / len(a | b)
-
-
-def token_n_grams(s):
-    """
-    Return n-grams, calculated per token.
-    """
-    return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])]
-
-
-def tokenize_string(s):
-    """
-    Normalize and tokenize, should be broken up.
-    """
-    return [token for token in s.lower().split()]
-
-
-def nwise(iterable, n=2):
-    """
-    Generalized :func:`pairwise`. Split an iterable after every
-    `n` items.
-    """
-    i = iter(iterable)
-    piece = tuple(itertools.islice(i, n))
-    while piece:
-        yield piece
-        piece = tuple(itertools.islice(i, n))
-
-
 def num_project(s):
     """
     Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d0e5d48..1d27e96 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,5 +1,5 @@
 import pytest
-from fuzzycat.utils import slugify_string, cut
+from fuzzycat.utils import slugify_string, cut, author_similarity_score
 
 
 def test_slugify_string():
@@ -21,3 +21,8 @@ def test_cut():
     assert cut(3, sep=',')("a,b,c") == ""
     with pytest.raises(ValueError):
         cut(3, sep=',', ignore_missing_column=False)("a,b,c") == ""
+
+def test_author_similarity_score():
+    assert author_similarity_score("", "") == 0.0
+    assert author_similarity_score("Gregor Samsa", "G. Samsa") == 0.42857142857142855
+    assert author_similarity_score("Geronimo Samsa", "G. Samsa") == 0.375
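
For orientation, a minimal sketch of how the relocated helpers compose, as an illustrative interpreter session. It assumes the patched fuzzycat.utils is importable; the expected outputs follow directly from the definitions and the new tests above.

    >>> from fuzzycat.utils import author_similarity_score, nwise, token_n_grams
    >>> list(nwise("gregor", n=2))      # split the iterable after every 2 items
    [('g', 'r'), ('e', 'g'), ('o', 'r')]
    >>> token_n_grams("Gregor Samsa")   # lowercase, split on whitespace, bigrams per token
    ['gr', 'eg', 'or', 'sa', 'ms', 'a']
    >>> token_n_grams("G. Samsa")       # the odd-length token "samsa" leaves a 1-char tail
    ['g.', 'sa', 'ms', 'a']
    >>> author_similarity_score("Gregor Samsa", "G. Samsa")   # jaccard: 3 shared / 7 distinct
    0.42857142857142855

Comparing bigrams per token rather than whole tokens is what keeps an abbreviated form like "G. Samsa" close to "Gregor Samsa": the shared surname still contributes overlapping n-grams, so the score degrades gradually instead of dropping to zero.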