 fuzzycat/cluster.py |  3 +++
 fuzzycat/common.py  | 48 ++++++++++++++
 fuzzycat/utils.py   |  4 ++--
 fuzzycat/verify.py  | 49 +----------------
 tests/test_utils.py | 37 ++++++++++++-
 5 files changed, 90 insertions(+), 51 deletions(-)
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index bff80f9..7843577 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -299,6 +299,9 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
class Cluster:
"""
Setup and run clustering over a potentially large (100m) number of records.
+
+    The two main options are the iterable (TODO: work on parsed docs) and
+    the key function applied to each value to group by.
"""
def __init__(self,
iterable: collections.abc.Iterable,
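
For context, a minimal usage sketch of the two options the docstring names. Only the iterable parameter is visible in this hunk; the key keyword and the run() entry point are assumptions for illustration, not confirmed by the diff:

    from fuzzycat.cluster import Cluster, release_key_title_ngram

    # Assumed API: stream records in, group them by the key function's output.
    cluster = Cluster(iterable=open("releases.ndjson"),
                      key=release_key_title_ngram)
    cluster.run()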
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
new file mode 100644
index 0000000..7daec7a
--- /dev/null
+++ b/fuzzycat/common.py
@@ -0,0 +1,48 @@
+from enum import Enum
+
+
+class Status(str, Enum):
+ """
+ Match status.
+ """
+ EXACT = 'exact'
+ DIFFERENT = 'different'
+ STRONG = 'strong'
+ WEAK = 'weak'
+    AMBIGUOUS = 'ambiguous'
+
+
+class OK(str, Enum):
+ """
+ Reason for assuming we have a match.
+ """
+ ARXIV_VERSION = 'ok.arxiv_version'
+ FIGSHARE_VERSION = 'ok.figshare_version'
+ DUMMY = 'ok.dummy'
+ TITLE_AUTHOR_MATCH = 'ok.title_author_match'
+ PREPRINT_PUBLISHED = 'ok.preprint_published'
+ SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
+ TOKENIZED_AUTHORS = 'ok.tokenized_authors'
+ DATACITE_RELATED_ID = 'ok.datacite_related_id'
+
+
+class Miss(str, Enum):
+ """
+ Reasons indicating mismatch.
+ """
+ ARXIV_VERSION = 'miss.arxiv_version'
+ BLACKLISTED = 'miss.blacklisted'
+ BLACKLISTED_FRAGMENT = 'miss.blacklisted_fragment'
+ CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
+ SHORT_TITLE = 'miss.short_title'
+ YEAR = 'miss.year'
+ CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
+ NUM_DIFF = 'miss.num_diff'
+ DATASET_DOI = 'miss.dataset_doi'
+ RELEASE_TYPE = 'miss.release_type'
+ CHEM_FORMULA = 'miss.chem_formula'
+ SUBTITLE = 'miss.subtitle'
+ BOOK_CHAPTER = 'miss.book_chapter'
+ TITLE_FILENAME = 'miss.title_filename'
+ COMPONENT = 'miss.component'
+ APPENDIX = 'miss.appendix'
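
Since Status, OK, and Miss subclass str as well as Enum, members compare equal to their plain string values and serialize as those values, which keeps match results easy to log and to write out as JSON. A quick illustration:

    import json
    from fuzzycat.common import Miss, Status

    assert Status.EXACT == "exact"  # str mixin: member equals its raw value
    assert json.dumps(Miss.SHORT_TITLE) == '"miss.short_title"'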
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 56aa467..7a8f067 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -43,11 +43,11 @@ def jaccard(a, b):
return len(a & b) / len(a | b)
-def token_n_grams(s):
+def token_n_grams(s, n=2):
"""
Return n-grams, calculated per token.
"""
- return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])]
+ return ["".join(v) for v in itertools.chain(*[nwise(v, n=n) for v in tokenize_string(s)])]
def tokenize_string(s):
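
With the window size exposed as a parameter, callers can now tune the n-gram granularity. N-grams are taken within each whitespace token, and a trailing group may be shorter than n, as the new tests below pin down. For example:

    from fuzzycat.utils import token_n_grams

    token_n_grams("abc hello")       # ['ab', 'c', 'he', 'll', 'o'] (default n=2)
    token_n_grams("abc hello", n=3)  # ['abc', 'hel', 'lo']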
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9a0fbd5..0fb9358 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -64,8 +64,8 @@ import json
import operator
import re
import sys
-from enum import Enum
+from fuzzycat.common import OK, Miss, Status
from fuzzycat.utils import author_similarity_score, slugify_string
# The result of clustering are documents that have a key k and a list of values
@@ -76,53 +76,6 @@ get_key_values = operator.itemgetter("k", "v")
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
-class Status(str, Enum):
- """
- Match status.
- """
- EXACT = 'exact'
- DIFFERENT = 'different'
- STRONG = 'strong'
- WEAK = 'weak'
- AMBIGUOUS = 'ambigiuous'
-
-
-class OK(str, Enum):
- """
- Reason for assuming we have a match.
- """
- ARXIV_VERSION = 'ok.arxiv_version'
- FIGSHARE_VERSION = 'ok.figshare_version'
- DUMMY = 'ok.dummy'
- TITLE_AUTHOR_MATCH = 'ok.title_author_match'
- PREPRINT_PUBLISHED = 'ok.preprint_published'
- SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
- TOKENIZED_AUTHORS = 'ok.tokenized_authors'
- DATACITE_RELATED_ID = 'ok.datacite_related_id'
-
-
-class Miss(str, Enum):
- """
- Reasons indicating mismatch.
- """
- ARXIV_VERSION = 'miss.arxiv_version'
- BLACKLISTED = 'miss.blacklisted'
- BLACKLISTED_FRAGMENT = 'miss.blacklisted_fragment'
- CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
- SHORT_TITLE = 'miss.short_title'
- YEAR = 'miss.year'
- CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
- NUM_DIFF = 'miss.num_diff'
- DATASET_DOI = 'miss.dataset_doi'
- RELEASE_TYPE = 'miss.release_type'
- CHEM_FORMULA = 'miss.chem_formula'
- SUBTITLE = 'miss.subtitle'
- BOOK_CHAPTER = 'miss.book_chapter'
- TITLE_FILENAME = 'miss.title_filename'
- COMPONENT = 'miss.component'
- APPENDIX = 'miss.appendix'
-
-
class GroupVerifier:
"""
Verifier.
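
The enums move to fuzzycat.common unchanged; a verification result presumably pairs a Status with an OK or Miss reason. A hedged sketch of that shape; the function below and its rules are illustrative only, not the module's actual logic:

    from fuzzycat.common import OK, Miss, Status

    def verify_titles(a: dict, b: dict):
        # Illustrative only: return a (status, reason) pair.
        ta, tb = a.get("title", ""), b.get("title", "")
        if len(ta) < 5 or len(tb) < 5:
            return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
        if ta == tb:
            return (Status.STRONG, OK.TITLE_AUTHOR_MATCH)
        return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)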
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ea188a4..bc0d918 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,6 @@
import pytest
-from fuzzycat.utils import author_similarity_score, cut, slugify_string
+from fuzzycat.utils import author_similarity_score, cut, slugify_string, jaccard, token_n_grams, tokenize_string, nwise
def test_slugify_string():
@@ -28,3 +28,38 @@ def test_author_similarity_score():
assert author_similarity_score("", "") == 0.0
assert author_similarity_score("Gregor Samsa", "G. Samsa") == 0.42857142857142855
assert author_similarity_score("Geronimo Samsa", "G. Samsa") == 0.375
+
+
+def test_jaccard():
+ assert jaccard(set(), set()) == 0
+ assert jaccard(set(["a"]), set()) == 0
+ assert jaccard(set(["a"]), set(["a"])) == 1.0
+ assert jaccard(set(["a", "b"]), set(["a"])) == 0.5
+ assert jaccard(set(["a"]), set(["a", "b"])) == 0.5
+ assert jaccard(set(["a", "b", "c"]), set(["a", "c"])) == 2 / 3
+
+
+def test_token_n_grams():
+ assert token_n_grams("") == []
+ assert token_n_grams("a") == ["a"]
+ assert token_n_grams("abc") == ["ab", "c"]
+ assert token_n_grams("abc", n=3) == ["abc"]
+ assert token_n_grams("abc", n=1) == ["a", "b", "c"]
+ assert token_n_grams("abc hello world", n=3) == ["abc", "hel", "lo", "wor", "ld"]
+
+
+def test_tokenize_string():
+ assert tokenize_string("") == []
+ assert tokenize_string("a") == ["a"]
+ assert tokenize_string("a b") == ["a", "b"]
+ assert tokenize_string("a b ") == ["a", "b"]
+ assert tokenize_string("a b=c") == ["a", "b=c"]
+ assert tokenize_string("a b 1999") == ["a", "b", "1999"]
+ assert tokenize_string("a?b*1999") == ["a?b*1999"]
+
+
+def test_nwise():
+ assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
+ assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )]
+ assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
+
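
The new nwise tests pin down its contract: non-overlapping windows of size n over an iterable, with a shorter final window when elements run out. A minimal sketch that satisfies these assertions; the actual helper in fuzzycat/utils.py may be implemented differently:

    import itertools

    def nwise(iterable, n=2):
        # Yield non-overlapping n-tuples; the last may be shorter.
        it = iter(iterable)
        while True:
            chunk = tuple(itertools.islice(it, n))
            if not chunk:
                return
            yield chunk

    assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
    assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]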