From b5460fe884582cd7c7e6cc4f5b6cd2f1f0af1f86 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sat, 14 Nov 2020 03:51:25 +0100 Subject: wip: verification and tests --- fuzzycat/verify.py | 149 ++++++++++++++++++++++++++++++--------------- tests/test_verify.py | 31 ++++++++++ tests/test_verify/0000.yml | 104 +++++++++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 48 deletions(-) create mode 100644 tests/test_verify.py create mode 100644 tests/test_verify/0000.yml diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 1a0fb95..c937aa8 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -29,12 +29,21 @@ import collections import itertools import json import operator +import sys +from enum import Enum + +from fuzzycat.cluster import slugify_string get_key_values = operator.itemgetter("k", "v") # There titles appear too often, so ignore them for now. TITLE_BLACKLIST = set([ "", + "about this issue", + "about this journal", + "abbreviations and acronyms", + "acknowledgment of reviewers", + "abbildung", "abstracts", "acknowledgements", "acknowledgments", @@ -108,6 +117,39 @@ TITLE_BLACKLIST = set([ ]) +class Status(str, Enum): + """ + Match status. + """ + EXACT = 'exact' + DIFFERENT = 'different' + STRONG = 'strong' + WEAK = 'weak' + AMBIGUOUS = 'ambigiuous' + + +class OK(str, Enum): + """ + Reason for assuming we have a match. + """ + ARXIV_VERSION = 'ok.arxiv_version' + DUMMY = 'ok.dummy' + TITLE_AUTHOR_MATCH = 'ok.title_author_match' + PREPRINT_PUBLISHED = 'ok.preprint_published' + + +class Miss(str, Enum): + """ + Reasons indicating mismatch. + """ + ARXIV_VERSION = 'miss.arxiv_version' + BLACKLISTED = 'miss.blacklisted' + CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty' + SHORT_TITLE = 'miss.short_title' + YEAR = 'miss.year' + CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla + + class GroupVerifier: """ Verifier. @@ -122,15 +164,12 @@ class GroupVerifier: def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10): self.iterable: collections.abc.Iterable = iterable self.max_cluster_size: int = 10 - self.counter = collections.Counter({ - "unique": 0, - "too_large": 0, - }) + self.counter = collections.Counter() def run(self): for i, line in enumerate(self.iterable): if i % 20000 == 0: - print(i) + print(i, file=sys.stderr) line = line.strip() if not line: continue @@ -143,46 +182,60 @@ class GroupVerifier: self.counter["too_large"] += 1 continue for a, b in itertools.combinations(vs, r=2): - result = self.compare(a, b) - # print(a.get("ident"), b.get("ident"), result) - # print(a.get("title")[:30], " ---- ", b.get("title")[:20]) - - print(json.dumps(dict(self.counter))) - - def compare(self, a, b): - """ - We compare two release entities here. - - * ext_ids.doi - * contribs - * is the title meaningful enough, is it too common, too short - * files share a sha1 - * arxiv versions - """ - if len(a.get("title")) < 5: - self.counter["short_title"] += 1 - return False - if a.get("title", "").lower() in TITLE_BLACKLIST: - self.counter["blacklist"] += 1 - return False - - arxiv_id_a = a.get("ext_ids", {}).get("arxiv") - arxiv_id_b = b.get("ext_ids", {}).get("arxiv") - if arxiv_id_a and arxiv_id_b: - id_a, version_a = arxiv_id_a.split("v") - id_b, version_b = arxiv_id_b.split("v") - if id_a == id_b: - self.counter["arxiv_v"] += 1 - return True - else: - return False - - a_authors = set([v.get("raw_name") for v in a.get("contribs", [])]) - b_authors = set([v.get("raw_name") for v in b.get("contribs", [])]) - - if len(a_authors & b_authors) == 0: - self.counter["contrib_miss"] += 1 - return False - - self.counter["dummy"] += 1 - return True + result, reason = compare(a, b) + self.counter[reason] += 1 + print("https://fatcat.wiki/release/{}".format(a["ident"]), + "https://fatcat.wiki/release/{}".format(b["ident"]), result, reason) + + self.counter["total"] = sum(v for _, v in self.counter.items()) + print(json.dumps(dict(self.counter)), file=sys.stderr) + + +def compare(a, b): + """ + Compare two entities, return match status. + """ + if len(a.get("title", "")) < 5: + return (Status.AMBIGUOUS, Miss.SHORT_TITLE) + if a.get("title", "").lower() in TITLE_BLACKLIST: + return (Status.AMBIGUOUS, Miss.BLACKLISTED) + + if "Zweckverband Volkshochschule " in a.get("title") and a.get("title") != b.get("title"): + return (Status.DIFFERENT, Miss.CUSTOM_VHS) + + arxiv_id_a = a.get("ext_ids", {}).get("arxiv") + arxiv_id_b = b.get("ext_ids", {}).get("arxiv") + + a_authors = set([v.get("raw_name") for v in a.get("contribs", [])]) + b_authors = set([v.get("raw_name") for v in b.get("contribs", [])]) + a_release_year = a.get("release_year") + b_release_year = b.get("release_year") + + if a.get("title") == b.get("title"): + if a_authors and (a_authors == b_authors): + if a_release_year and b_release_year and a_release_year != b_release_year: + return (Status.DIFFERENT, Miss.YEAR) + return (Status.EXACT, OK.TITLE_AUTHOR_MATCH) + + a_slug_title = slugify_string(a.get("title")) + b_slug_title = slugify_string(b.get("title")) + + if a_slug_title and b_slug_title and a_slug_title == b_slug_title: + if a_authors and len(a_authors & b_authors) > 0: + if arxiv_id_a is not None and arxiv_id_b is None or arxiv_id_a is None and arxiv_id_b is not None: + return (Status.STRONG, OK.PREPRINT_PUBLISHED) + + arxiv_id_a = a.get("ext_ids", {}).get("arxiv") + arxiv_id_b = b.get("ext_ids", {}).get("arxiv") + if arxiv_id_a and arxiv_id_b: + id_a, version_a = arxiv_id_a.split("v") + id_b, version_b = arxiv_id_b.split("v") + if id_a == id_b: + return (Status.STRONG, OK.ARXIV_VERSION) + else: + return (Status.DIFFERENT, Miss.ARXIV_VERSION) + + if a_authors and len(a_authors & b_authors) == 0: + return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY) + + return (Status.AMBIGUOUS, OK.DUMMY) diff --git a/tests/test_verify.py b/tests/test_verify.py new file mode 100644 index 0000000..be4b0ec --- /dev/null +++ b/tests/test_verify.py @@ -0,0 +1,31 @@ +import operator +import os +import yaml +try: + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + +from fuzzycat.verify import compare, Status + + +def test_verify_cases(): + """ + Test verification cases, via yaml. + """ + status_map = { + "AMBIGUOUS": Status.AMBIGUOUS, + "DIFFERENT": Status.DIFFERENT, + "EXACT": Status.EXACT, + "STRONG": Status.STRONG, + "WEAK": Status.WEAK, + } + fields = operator.itemgetter("a", "b", "status", "about") + folder = os.path.join(os.path.dirname(__file__), "test_verify") + for root, _, files in os.walk(folder): + for fn in files: + with open(os.path.join(root, fn)) as f: + doc = yaml.load(f, Loader=Loader) + a, b, status, about = fields(doc) + result, _ = compare(a, b) + assert status_map.get(status) == result, about diff --git a/tests/test_verify/0000.yml b/tests/test_verify/0000.yml new file mode 100644 index 0000000..a82b2fe --- /dev/null +++ b/tests/test_verify/0000.yml @@ -0,0 +1,104 @@ +about: Same document should be an exact match. +status: EXACT +a: + abstracts: + - content: Belgium Herbarium image of Meise Botanic Garden. + lang: de + mimetype: text/plain + sha1: cd3c76f5fd94bcf260f9ad74f797d9e79a824b1d + contribs: + - index: 0 + raw_name: Meise Botanic Garden + role: author + ext_ids: + doi: 10.5281/zenodo.2830437 + extra: + datacite: + license: + - rights: Creative Commons Attribution Share Alike 4.0 International + rightsUri: http://creativecommons.org/licenses/by-sa/4.0/legalcode + - rights: Open Access + rightsUri: info:eu-repo/semantics/openAccess + relations: + - relatedIdentifier: 10.5281/zenodo.2830436 + relatedIdentifierType: DOI + relationType: IsVersionOf + - relatedIdentifier: https://zenodo.org/communities/belgiumherbarium + relatedIdentifierType: URL + relationType: IsPartOf + resourceType: Photo + resourceTypeGeneral: Image + subjects: + - subject: Biodiversity + - subject: Taxonomy + - subject: Terrestrial + - subject: Herbarium + - subject: Caryophyllaceae + release_month: 5 + files: [] + filesets: [] + ident: jihezebuzbgxpmsj3356idy52e + license_slug: CC-BY-SA + publisher: Zenodo + refs: [] + release_date: "2019-05-14" + release_stage: published + release_type: graphic + release_year: 2019 + revision: 560ca270-45c5-4f21-89a6-0dfd73039546 + state: active + title: Dianthus carthusianorum L. (BR0000005352692) + webcaptures: [] + work_id: aaaaa34uyngfplcgmoejzjyjne +b: + abstracts: + - content: Belgium Herbarium image of Meise Botanic Garden. + lang: de + mimetype: text/plain + sha1: cd3c76f5fd94bcf260f9ad74f797d9e79a824b1d + contribs: + - index: 0 + raw_name: Meise Botanic Garden + role: author + ext_ids: + doi: 10.5281/zenodo.2830437 + extra: + datacite: + license: + - rights: Creative Commons Attribution Share Alike 4.0 International + rightsUri: http://creativecommons.org/licenses/by-sa/4.0/legalcode + - rights: Open Access + rightsUri: info:eu-repo/semantics/openAccess + relations: + - relatedIdentifier: 10.5281/zenodo.2830436 + relatedIdentifierType: DOI + relationType: IsVersionOf + - relatedIdentifier: https://zenodo.org/communities/belgiumherbarium + relatedIdentifierType: URL + relationType: IsPartOf + resourceType: Photo + resourceTypeGeneral: Image + subjects: + - subject: Biodiversity + - subject: Taxonomy + - subject: Terrestrial + - subject: Herbarium + - subject: Caryophyllaceae + release_month: 5 + files: [] + filesets: [] + ident: jihezebuzbgxpmsj3356idy52e + license_slug: CC-BY-SA + publisher: Zenodo + refs: [] + release_date: "2019-05-14" + release_stage: published + release_type: graphic + release_year: 2019 + revision: 560ca270-45c5-4f21-89a6-0dfd73039546 + state: active + title: Dianthus carthusianorum L. (BR0000005352692) + webcaptures: [] + work_id: aaaaa34uyngfplcgmoejzjyjne + + -- cgit v1.2.3