-rw-r--r-- | fuzzycat/cluster.py |  3 |
-rw-r--r-- | fuzzycat/common.py  | 48 |
-rw-r--r-- | fuzzycat/utils.py   |  4 |
-rw-r--r-- | fuzzycat/verify.py  | 49 |
-rw-r--r-- | tests/test_utils.py | 37 |
5 files changed, 90 insertions, 51 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index bff80f9..7843577 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -299,6 +299,9 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
 class Cluster:
     """
     Setup and run clustering over a potentially large (100m) number of records.
+
+    Two main options are iterable (TODO: work on parsed docs), and the key
+    function to apply to value to group by.
     """
     def __init__(self,
                  iterable: collections.abc.Iterable,
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
new file mode 100644
index 0000000..7daec7a
--- /dev/null
+++ b/fuzzycat/common.py
@@ -0,0 +1,48 @@
+from enum import Enum
+
+
+class Status(str, Enum):
+    """
+    Match status.
+    """
+    EXACT = 'exact'
+    DIFFERENT = 'different'
+    STRONG = 'strong'
+    WEAK = 'weak'
+    AMBIGUOUS = 'ambigiuous'
+
+
+class OK(str, Enum):
+    """
+    Reason for assuming we have a match.
+    """
+    ARXIV_VERSION = 'ok.arxiv_version'
+    FIGSHARE_VERSION = 'ok.figshare_version'
+    DUMMY = 'ok.dummy'
+    TITLE_AUTHOR_MATCH = 'ok.title_author_match'
+    PREPRINT_PUBLISHED = 'ok.preprint_published'
+    SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
+    TOKENIZED_AUTHORS = 'ok.tokenized_authors'
+    DATACITE_RELATED_ID = 'ok.datacite_related_id'
+
+
+class Miss(str, Enum):
+    """
+    Reasons indicating mismatch.
+    """
+    ARXIV_VERSION = 'miss.arxiv_version'
+    BLACKLISTED = 'miss.blacklisted'
+    BLACKLISTED_FRAGMENT = 'miss.blacklisted_fragment'
+    CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
+    SHORT_TITLE = 'miss.short_title'
+    YEAR = 'miss.year'
+    CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
+    NUM_DIFF = 'miss.num_diff'
+    DATASET_DOI = 'miss.dataset_doi'
+    RELEASE_TYPE = 'miss.release_type'
+    CHEM_FORMULA = 'miss.chem_formula'
+    SUBTITLE = 'miss.subtitle'
+    BOOK_CHAPTER = 'miss.book_chapter'
+    TITLE_FILENAME = 'miss.title_filename'
+    COMPONENT = 'miss.component'
+    APPENDIX = 'miss.appendix'
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 56aa467..7a8f067 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -43,11 +43,11 @@ def jaccard(a, b):
     return len(a & b) / len(a | b)
 
 
-def token_n_grams(s):
+def token_n_grams(s, n=2):
     """
     Return n-grams, calculated per token.
     """
-    return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])]
+    return ["".join(v) for v in itertools.chain(*[nwise(v, n=n) for v in tokenize_string(s)])]
 
 
 def tokenize_string(s):
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9a0fbd5..0fb9358 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -64,8 +64,8 @@ import json
 import operator
 import re
 import sys
-from enum import Enum
 
+from fuzzycat.common import OK, Miss, Status
 from fuzzycat.utils import author_similarity_score, slugify_string
 
 # The result of clustering are documents that have a key k and a list of values
@@ -76,53 +76,6 @@ get_key_values = operator.itemgetter("k", "v")
 
 CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
 
-class Status(str, Enum):
-    """
-    Match status.
-    """
-    EXACT = 'exact'
-    DIFFERENT = 'different'
-    STRONG = 'strong'
-    WEAK = 'weak'
-    AMBIGUOUS = 'ambigiuous'
-
-
-class OK(str, Enum):
-    """
-    Reason for assuming we have a match.
-    """
-    ARXIV_VERSION = 'ok.arxiv_version'
-    FIGSHARE_VERSION = 'ok.figshare_version'
-    DUMMY = 'ok.dummy'
-    TITLE_AUTHOR_MATCH = 'ok.title_author_match'
-    PREPRINT_PUBLISHED = 'ok.preprint_published'
-    SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
-    TOKENIZED_AUTHORS = 'ok.tokenized_authors'
-    DATACITE_RELATED_ID = 'ok.datacite_related_id'
-
-
-class Miss(str, Enum):
-    """
-    Reasons indicating mismatch.
-    """
-    ARXIV_VERSION = 'miss.arxiv_version'
-    BLACKLISTED = 'miss.blacklisted'
-    BLACKLISTED_FRAGMENT = 'miss.blacklisted_fragment'
-    CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
-    SHORT_TITLE = 'miss.short_title'
-    YEAR = 'miss.year'
-    CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
-    NUM_DIFF = 'miss.num_diff'
-    DATASET_DOI = 'miss.dataset_doi'
-    RELEASE_TYPE = 'miss.release_type'
-    CHEM_FORMULA = 'miss.chem_formula'
-    SUBTITLE = 'miss.subtitle'
-    BOOK_CHAPTER = 'miss.book_chapter'
-    TITLE_FILENAME = 'miss.title_filename'
-    COMPONENT = 'miss.component'
-    APPENDIX = 'miss.appendix'
-
-
 class GroupVerifier:
     """
     Verifier.
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ea188a4..bc0d918 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,6 @@
 import pytest
 
-from fuzzycat.utils import author_similarity_score, cut, slugify_string
+from fuzzycat.utils import author_similarity_score, cut, slugify_string, jaccard, token_n_grams, tokenize_string, nwise
 
 
 def test_slugify_string():
@@ -28,3 +28,38 @@ def test_author_similarity_score():
     assert author_similarity_score("", "") == 0.0
     assert author_similarity_score("Gregor Samsa", "G. Samsa") == 0.42857142857142855
     assert author_similarity_score("Geronimo Samsa", "G. Samsa") == 0.375
+
+
+def test_jaccard():
+    assert jaccard(set(), set()) == 0
+    assert jaccard(set(["a"]), set()) == 0
+    assert jaccard(set(["a"]), set(["a"])) == 1.0
+    assert jaccard(set(["a", "b"]), set(["a"])) == 0.5
+    assert jaccard(set(["a"]), set(["a", "b"])) == 0.5
+    assert jaccard(set(["a", "b", "c"]), set(["a", "c"])) == 2 / 3
+
+
+def test_token_n_grams():
+    assert token_n_grams("") == []
+    assert token_n_grams("a") == ["a"]
+    assert token_n_grams("abc") == ["ab", "c"]
+    assert token_n_grams("abc", n=3) == ["abc"]
+    assert token_n_grams("abc", n=1) == ["a", "b", "c"]
+    assert token_n_grams("abc hello world", n=3) == ["abc", "hel", "lo", "wor", "ld"]
+
+
+def test_tokenize_string():
+    assert tokenize_string("") == []
+    assert tokenize_string("a") == ["a"]
+    assert tokenize_string("a b") == ["a", "b"]
+    assert tokenize_string("a b ") == ["a", "b"]
+    assert tokenize_string("a b=c") == ["a", "b=c"]
+    assert tokenize_string("a b 1999") == ["a", "b", "1999"]
+    assert tokenize_string("a?b*1999") == ["a?b*1999"]
+
+
+def test_nwise():
+    assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
+    assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )]
+    assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]