 fuzzycat/cluster.py |  3 +++
 fuzzycat/common.py  | 48 ++++++++++++++
 fuzzycat/utils.py   |  4 ++--
 fuzzycat/verify.py  | 49 +----------------
 tests/test_utils.py | 37 ++++++++++++-
 5 files changed, 90 insertions(+), 51 deletions(-)
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index bff80f9..7843577 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -299,6 +299,9 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
class Cluster:
"""
Setup and run clustering over a potentially large (100m) number of records.
+
+    The two main options are the iterable (TODO: work on parsed docs) and
+    the key function applied to each value to group by.
"""
def __init__(self,
iterable: collections.abc.Iterable,
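
For context, a minimal usage sketch of the two options the docstring names. Only the iterable parameter is visible in this hunk; the key keyword and the run() entry point are assumptions for illustration, not confirmed by the diff:

    from fuzzycat.cluster import Cluster, release_key_title_ngram

    # Assumed API: stream records in, group them by the key function's output.
    cluster = Cluster(iterable=open("releases.ndjson"),
                      key=release_key_title_ngram)
    cluster.run()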
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
new file mode 100644
index 0000000..7daec7a
--- /dev/null
+++ b/fuzzycat/common.py
@@ -0,0 +1,48 @@
+from enum import Enum
+
+
+class Status(str, Enum):
+ """
+ Match status.
+ """
+ EXACT = 'exact'
+ DIFFERENT = 'different'
+ STRONG = 'strong'
+ WEAK = 'weak'
+    AMBIGUOUS = 'ambiguous'
+
+
+class OK(str, Enum):
+ """
+ Reason for assuming we have a match.
+ """
+ ARXIV_VERSION = 'ok.arxiv_version'
+ FIGSHARE_VERSION = 'ok.figshare_version'
+ DUMMY = 'ok.dummy'
+ TITLE_AUTHOR_MATCH = 'ok.title_author_match'
+ PREPRINT_PUBLISHED = 'ok.preprint_published'
+ SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
+ TOKENIZED_AUTHORS = 'ok.tokenized_authors'
+ DATACITE_RELATED_ID = 'ok.datacite_related_id'
+
+
+class Miss(str, Enum):
+ """
+ Reasons indicating mismatch.
+ """
+ ARXIV_VERSION = 'miss.arxiv_version'
+ BLACKLISTED = 'miss.blacklisted'
+ BLACKLISTED_FRAGMENT = 'miss.blacklisted_fragment'
+ CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
+ SHORT_TITLE = 'miss.short_title'
+ YEAR = 'miss.year'
+ CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
+ NUM_DIFF = 'miss.num_diff'
+ DATASET_DOI = 'miss.dataset_doi'
+ RELEASE_TYPE = 'miss.release_type'
+ CHEM_FORMULA = 'miss.chem_formula'
+ SUBTITLE = 'miss.subtitle'
+ BOOK_CHAPTER = 'miss.book_chapter'
+ TITLE_FILENAME = 'miss.title_filename'
+ COMPONENT = 'miss.component'
+ APPENDIX = 'miss.appendix'
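
Since Status, OK, and Miss subclass str as well as Enum, members compare equal to their plain string values and serialize as those values, which keeps match results easy to log and to write out as JSON. A quick illustration:

    import json
    from fuzzycat.common import Miss, Status

    assert Status.EXACT == "exact"  # str mixin: member equals its raw value
    assert json.dumps(Miss.SHORT_TITLE) == '"miss.short_title"'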
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 56aa467..7a8f067 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -43,11 +43,11 @@ def jaccard(a, b):
return len(a & b) / len(a | b)
-def token_n_grams(s):
+def token_n_grams(s, n=2):
"""
Return n-grams, calculated per token.
"""
- return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])]
+ return ["".join(v) for v in itertools.chain(*[nwise(v, n=n) for v in tokenize_string(s)])]
def tokenize_string(s):
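
With the window size exposed as a parameter, callers can now tune the n-gram granularity. N-grams are taken within each whitespace token, and a trailing group may be shorter than n, as the new tests below pin down. For example:

    from fuzzycat.utils import token_n_grams

    token_n_grams("abc hello")       # ['ab', 'c', 'he', 'll', 'o'] (default n=2)
    token_n_grams("abc hello", n=3)  # ['abc', 'hel', 'lo']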
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9a0fbd5..0fb9358 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -64,8 +64,8 @@ import json
import operator
import re
import sys
-from enum import Enum
+from fuzzycat.common import OK, Miss, Status
from fuzzycat.utils import author_similarity_score, slugify_string
# The result of clustering are documents that have a key k and a list of values
@@ -76,53 +76,6 @@ get_key_values = operator.itemgetter("k", "v")
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
-class Status(str, Enum):
- """
- Match status.
- """
- EXACT = 'exact'
- DIFFERENT = 'different'
- STRONG = 'strong'
- WEAK = 'weak'
- AMBIGUOUS = 'ambigiuous'
-
-
-class OK(str, Enum):
- """
- Reason for assuming we have a match.
- """
- ARXIV_VERSION = 'ok.arxiv_version'
- FIGSHARE_VERSION = 'ok.figshare_version'
- DUMMY = 'ok.dummy'
- TITLE_AUTHOR_MATCH = 'ok.title_author_match'
- PREPRINT_PUBLISHED = 'ok.preprint_published'
- SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
- TOKENIZED_AUTHORS = 'ok.tokenized_authors'
- DATACITE_RELATED_ID = 'ok.datacite_related_id'
-
-
-class Miss(str, Enum):
- """
- Reasons indicating mismatch.
- """
- ARXIV_VERSION = 'miss.arxiv_version'
- BLACKLISTED = 'miss.blacklisted'
- BLACKLISTED_FRAGMENT = 'miss.blacklisted_fragment'
- CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
- SHORT_TITLE = 'miss.short_title'
- YEAR = 'miss.year'
- CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
- NUM_DIFF = 'miss.num_diff'
- DATASET_DOI = 'miss.dataset_doi'
- RELEASE_TYPE = 'miss.release_type'
- CHEM_FORMULA = 'miss.chem_formula'
- SUBTITLE = 'miss.subtitle'
- BOOK_CHAPTER = 'miss.book_chapter'
- TITLE_FILENAME = 'miss.title_filename'
- COMPONENT = 'miss.component'
- APPENDIX = 'miss.appendix'
-
-
class GroupVerifier:
"""
Verifier.
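
The enums move to fuzzycat.common unchanged; a verification result presumably pairs a Status with an OK or Miss reason. A hedged sketch of that shape; the function below and its rules are illustrative only, not the module's actual logic:

    from fuzzycat.common import OK, Miss, Status

    def verify_titles(a: dict, b: dict):
        # Illustrative only: return a (status, reason) pair.
        ta, tb = a.get("title", ""), b.get("title", "")
        if len(ta) < 5 or len(tb) < 5:
            return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
        if ta == tb:
            return (Status.STRONG, OK.TITLE_AUTHOR_MATCH)
        return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)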
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ea188a4..bc0d918 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,6 @@
import pytest
-from fuzzycat.utils import author_similarity_score, cut, slugify_string
+from fuzzycat.utils import author_similarity_score, cut, slugify_string, jaccard, token_n_grams, tokenize_string, nwise
def test_slugify_string():
@@ -28,3 +28,38 @@ def test_author_similarity_score():
assert author_similarity_score("", "") == 0.0
assert author_similarity_score("Gregor Samsa", "G. Samsa") == 0.42857142857142855
assert author_similarity_score("Geronimo Samsa", "G. Samsa") == 0.375
+
+
+def test_jaccard():
+ assert jaccard(set(), set()) == 0
+ assert jaccard(set(["a"]), set()) == 0
+ assert jaccard(set(["a"]), set(["a"])) == 1.0
+ assert jaccard(set(["a", "b"]), set(["a"])) == 0.5
+ assert jaccard(set(["a"]), set(["a", "b"])) == 0.5
+ assert jaccard(set(["a", "b", "c"]), set(["a", "c"])) == 2 / 3
+
+
+def test_token_n_grams():
+ assert token_n_grams("") == []
+ assert token_n_grams("a") == ["a"]
+ assert token_n_grams("abc") == ["ab", "c"]
+ assert token_n_grams("abc", n=3) == ["abc"]
+ assert token_n_grams("abc", n=1) == ["a", "b", "c"]
+ assert token_n_grams("abc hello world", n=3) == ["abc", "hel", "lo", "wor", "ld"]
+
+
+def test_tokenize_string():
+ assert tokenize_string("") == []
+ assert tokenize_string("a") == ["a"]
+ assert tokenize_string("a b") == ["a", "b"]
+ assert tokenize_string("a b ") == ["a", "b"]
+ assert tokenize_string("a b=c") == ["a", "b=c"]
+ assert tokenize_string("a b 1999") == ["a", "b", "1999"]
+ assert tokenize_string("a?b*1999") == ["a?b*1999"]
+
+
+def test_nwise():
+ assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
+ assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )]
+ assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
+
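
The new nwise tests pin down its contract: non-overlapping windows of size n over an iterable, with a shorter final window when elements run out. A minimal sketch that satisfies these assertions; the actual helper in fuzzycat/utils.py may be implemented differently:

    import itertools

    def nwise(iterable, n=2):
        # Yield non-overlapping n-tuples; the last may be shorter.
        it = iter(iterable)
        while True:
            chunk = tuple(itertools.islice(it, n))
            if not chunk:
                return
            yield chunk

    assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
    assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]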