-rw-r--r--  fuzzycat/utils.py    43
-rw-r--r--  fuzzycat/verify.py   46
-rw-r--r--  tests/test_utils.py   7
3 files changed, 52 insertions, 44 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index f269b11..5ded48c 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,3 +1,4 @@
+import itertools
import io
import string
@@ -24,3 +25,45 @@ def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
return parts[f]
return func
+
+def author_similarity_score(u, v):
+ """
+ Given two author strings, return a similarity score between 0 and 1.
+ """
+ return jaccard(set(token_n_grams(u)), set(token_n_grams(v)))
+
+
+def jaccard(a, b):
+ """
+ Jaccard of sets a and b.
+ """
+ if len(a | b) == 0:
+ return 0
+ return len(a & b) / len(a | b)
+
+
+def token_n_grams(s):
+ """
+ Return n-grams, calculated per token.
+ """
+ return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])]
+
+
+def tokenize_string(s):
+ """
+ Normalize and tokenize, should be broken up.
+ """
+ return [token for token in s.lower().split()]
+
+
+def nwise(iterable, n=2):
+ """
+ Generalized :func:`pairwise`. Split an iterable after every
+ `n` items.
+ """
+ i = iter(iterable)
+ piece = tuple(itertools.islice(i, n))
+ while piece:
+ yield piece
+ piece = tuple(itertools.islice(i, n))
+
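For orientation, the new helpers compose like this (a doctest-style sketch, assuming the patch above is applied; the example strings come from the tests below):

    >>> from fuzzycat.utils import author_similarity_score, nwise, token_n_grams
    >>> list(nwise("samsa", n=2))             # split the iterable after every 2 items
    [('s', 'a'), ('m', 's'), ('a',)]
    >>> token_n_grams("Gregor Samsa")         # character bigrams per lowercased token
    ['gr', 'eg', 'or', 'sa', 'ms', 'a']
    >>> author_similarity_score("Gregor Samsa", "G. Samsa")   # Jaccard of the bigram sets
    0.42857142857142855
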
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 7a7f01f..9a0fbd5 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -66,8 +66,10 @@ import re
import sys
from enum import Enum
-from fuzzycat.cluster import slugify_string
+from fuzzycat.utils import author_similarity_score, slugify_string
+# The result of clustering is a set of documents, each with a key k and a
+# list of values v (the members of the cluster).
get_key_values = operator.itemgetter("k", "v")
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
@@ -364,48 +366,6 @@ def compare(a, b):
return (Status.AMBIGUOUS, OK.DUMMY)
-def author_similarity_score(u, v):
- """
- Given two author strings, return a similarity score between 0 and 1.
- """
- return jaccard(set(token_n_grams(u)), set(token_n_grams(v)))
-
-
-def jaccard(a, b):
- """
- Jaccard of sets a and b.
- """
- if len(a | b) == 0:
- return 0
- return len(a & b) / len(a | b)
-
-
-def token_n_grams(s):
- """
- Return n-grams, calculated per token.
- """
- return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])]
-
-
-def tokenize_string(s):
- """
- Normalize and tokenize, should be broken up.
- """
- return [token for token in s.lower().split()]
-
-
-def nwise(iterable, n=2):
- """
- Generalized :func:`pairwise`. Split an iterable after every
- `n` items.
- """
- i = iter(iterable)
- piece = tuple(itertools.islice(i, n))
- while piece:
- yield piece
- piece = tuple(itertools.islice(i, n))
-
-
def num_project(s):
"""
Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
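The new comment above `get_key_values` documents the shape of a cluster document; a minimal sketch of how the itemgetter is used (the document contents here are invented for illustration):

    import operator

    get_key_values = operator.itemgetter("k", "v")

    # One cluster document: a key k plus the list v of records in that cluster.
    doc = {"k": "some title", "v": [{"title": "Some Title"}, {"title": "Some Title."}]}
    k, v = get_key_values(doc)
    assert k == "some title" and len(v) == 2
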
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d0e5d48..1d27e96 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,5 +1,5 @@
import pytest
-from fuzzycat.utils import slugify_string, cut
+from fuzzycat.utils import slugify_string, cut, author_similarity_score
def test_slugify_string():
@@ -21,3 +21,8 @@ def test_cut():
assert cut(3, sep=',')("a,b,c") == ""
with pytest.raises(ValueError):
cut(3, sep=',', ignore_missing_column=False)("a,b,c") == ""
+
+def test_author_similarity_score():
+ assert author_similarity_score("", "") == 0.0
+ assert author_similarity_score("Gregor Samsa", "G. Samsa") == 0.42857142857142855
+ assert author_similarity_score("Geronimo Samsa", "G. Samsa") == 0.375
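
The expected scores follow directly from the bigram sets; a worked check of the second assertion (sets written out by hand):

    a = {"gr", "eg", "or", "sa", "ms", "a"}  # token_n_grams("Gregor Samsa")
    b = {"g.", "sa", "ms", "a"}              # token_n_grams("G. Samsa")
    assert len(a & b) == 3 and len(a | b) == 7
    assert len(a & b) / len(a | b) == 0.42857142857142855  # i.e. 3/7

The third assertion is the same count with "geronimo" contributing four bigrams instead of three, so the union grows to 8 while the intersection stays at 3, giving 3/8 = 0.375.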