From 16c0bbd6339aadad8b994867ba05a44a0b326a25 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 25 Nov 2020 01:29:38 +0100 Subject: extend tests --- fuzzycat/utils.py | 43 +++++++++++++++++++++++++++++++++++++++++++ fuzzycat/verify.py | 46 +++------------------------------------------- 2 files changed, 46 insertions(+), 43 deletions(-) (limited to 'fuzzycat') diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index f269b11..5ded48c 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -1,3 +1,4 @@ +import itertools import io import string @@ -24,3 +25,45 @@ def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True): return parts[f] return func + +def author_similarity_score(u, v): + """ + Given two author strings, return a similarity score between 0 and 1. + """ + return jaccard(set(token_n_grams(u)), set(token_n_grams(v))) + + +def jaccard(a, b): + """ + Jaccard of sets a and b. + """ + if len(a | b) == 0: + return 0 + return len(a & b) / len(a | b) + + +def token_n_grams(s): + """ + Return n-grams, calculated per token. + """ + return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])] + + +def tokenize_string(s): + """ + Normalize and tokenize, should be broken up. + """ + return [token for token in s.lower().split()] + + +def nwise(iterable, n=2): + """ + Generalized :func:`pairwise`. Split an iterable after every + `n` items. + """ + i = iter(iterable) + piece = tuple(itertools.islice(i, n)) + while piece: + yield piece + piece = tuple(itertools.islice(i, n)) + diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 7a7f01f..9a0fbd5 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -66,8 +66,10 @@ import re import sys from enum import Enum -from fuzzycat.cluster import slugify_string +from fuzzycat.utils import author_similarity_score, slugify_string +# The result of clustering is documents that have a key k and a list of values +# (of the cluster) v. 
get_key_values = operator.itemgetter("k", "v") # More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/ @@ -364,48 +366,6 @@ def compare(a, b): return (Status.AMBIGUOUS, OK.DUMMY) -def author_similarity_score(u, v): - """ - Given two author strings, return a similarity score between 0 and 1. - """ - return jaccard(set(token_n_grams(u)), set(token_n_grams(v))) - - -def jaccard(a, b): - """ - Jaccard of sets a and b. - """ - if len(a | b) == 0: - return 0 - return len(a & b) / len(a | b) - - -def token_n_grams(s): - """ - Return n-grams, calculated per token. - """ - return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])] - - -def tokenize_string(s): - """ - Normalize and tokenize, should be broken up. - """ - return [token for token in s.lower().split()] - - -def nwise(iterable, n=2): - """ - Generalized: func: `pairwise`. Split an iterable after every - `n` items. - """ - i = iter(iterable) - piece = tuple(itertools.islice(i, n)) - while piece: - yield piece - piece = tuple(itertools.islice(i, n)) - - def num_project(s): """ Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq, -- cgit v1.2.3