about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-14 03:51:25 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-14 03:51:25 +0100
commit b5460fe884582cd7c7e6cc4f5b6cd2f1f0af1f86 (patch)
tree 205f326b0d85c2cf6180b9802e1f119a9e5dfd90
parent 89b9da699446c9a7566b3a9a444221fe4982058a (diff)
download fuzzycat-b5460fe884582cd7c7e6cc4f5b6cd2f1f0af1f86.tar.gz
fuzzycat-b5460fe884582cd7c7e6cc4f5b6cd2f1f0af1f86.zip
wip: verification and tests
-rw-r--r-- fuzzycat/verify.py 149
-rw-r--r-- tests/test_verify.py 31
-rw-r--r-- tests/test_verify/0000.yml 104
3 files changed, 236 insertions, 48 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 1a0fb95..c937aa8 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -29,12 +29,21 @@ import collections
import itertools
import json
import operator
+import sys
+from enum import Enum
+
+from fuzzycat.cluster import slugify_string
get_key_values = operator.itemgetter("k", "v")
# These titles appear too often, so ignore them for now.
TITLE_BLACKLIST = set([
"",
+ "about this issue",
+ "about this journal",
+ "abbreviations and acronyms",
+ "acknowledgment of reviewers",
+ "abbildung",
"abstracts",
"acknowledgements",
"acknowledgments",
@@ -108,6 +117,39 @@ TITLE_BLACKLIST = set([
])
+class Status(str, Enum):
+ """
+ Match status.
+ """
+ EXACT = 'exact'
+ DIFFERENT = 'different'
+ STRONG = 'strong'
+ WEAK = 'weak'
+ AMBIGUOUS = 'ambigiuous'
+
+
+class OK(str, Enum):
+ """
+ Reason for assuming we have a match.
+ """
+ ARXIV_VERSION = 'ok.arxiv_version'
+ DUMMY = 'ok.dummy'
+ TITLE_AUTHOR_MATCH = 'ok.title_author_match'
+ PREPRINT_PUBLISHED = 'ok.preprint_published'
+
+
+class Miss(str, Enum):
+ """
+ Reasons indicating mismatch.
+ """
+ ARXIV_VERSION = 'miss.arxiv_version'
+ BLACKLISTED = 'miss.blacklisted'
+ CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
+ SHORT_TITLE = 'miss.short_title'
+ YEAR = 'miss.year'
+ CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
+
+
class GroupVerifier:
"""
Verifier.
@@ -122,15 +164,12 @@ class GroupVerifier:
def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10):
self.iterable: collections.abc.Iterable = iterable
self.max_cluster_size: int = 10
- self.counter = collections.Counter({
- "unique": 0,
- "too_large": 0,
- })
+ self.counter = collections.Counter()
def run(self):
for i, line in enumerate(self.iterable):
if i % 20000 == 0:
- print(i)
+ print(i, file=sys.stderr)
line = line.strip()
if not line:
continue
@@ -143,46 +182,60 @@ class GroupVerifier:
self.counter["too_large"] += 1
continue
for a, b in itertools.combinations(vs, r=2):
- result = self.compare(a, b)
- # print(a.get("ident"), b.get("ident"), result)
- # print(a.get("title")[:30], " ---- ", b.get("title")[:20])
-
- print(json.dumps(dict(self.counter)))
-
- def compare(self, a, b):
- """
- We compare two release entities here.
-
- * ext_ids.doi
- * contribs
- * is the title meaningful enough, is it too common, too short
- * files share a sha1
- * arxiv versions
- """
- if len(a.get("title")) < 5:
- self.counter["short_title"] += 1
- return False
- if a.get("title", "").lower() in TITLE_BLACKLIST:
- self.counter["blacklist"] += 1
- return False
-
- arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
- arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
- if arxiv_id_a and arxiv_id_b:
- id_a, version_a = arxiv_id_a.split("v")
- id_b, version_b = arxiv_id_b.split("v")
- if id_a == id_b:
- self.counter["arxiv_v"] += 1
- return True
- else:
- return False
-
- a_authors = set([v.get("raw_name") for v in a.get("contribs", [])])
- b_authors = set([v.get("raw_name") for v in b.get("contribs", [])])
-
- if len(a_authors & b_authors) == 0:
- self.counter["contrib_miss"] += 1
- return False
-
- self.counter["dummy"] += 1
- return True
+ result, reason = compare(a, b)
+ self.counter[reason] += 1
+ print("https://fatcat.wiki/release/{}".format(a["ident"]),
+ "https://fatcat.wiki/release/{}".format(b["ident"]), result, reason)
+
+ self.counter["total"] = sum(v for _, v in self.counter.items())
+ print(json.dumps(dict(self.counter)), file=sys.stderr)
+
+
+def compare(a, b):
+ """
+ Compare two entities, return match status.
+ """
+ if len(a.get("title", "")) < 5:
+ return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
+ if a.get("title", "").lower() in TITLE_BLACKLIST:
+ return (Status.AMBIGUOUS, Miss.BLACKLISTED)
+
+ if "Zweckverband Volkshochschule " in a.get("title") and a.get("title") != b.get("title"):
+ return (Status.DIFFERENT, Miss.CUSTOM_VHS)
+
+ arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
+ arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
+
+ a_authors = set([v.get("raw_name") for v in a.get("contribs", [])])
+ b_authors = set([v.get("raw_name") for v in b.get("contribs", [])])
+ a_release_year = a.get("release_year")
+ b_release_year = b.get("release_year")
+
+ if a.get("title") == b.get("title"):
+ if a_authors and (a_authors == b_authors):
+ if a_release_year and b_release_year and a_release_year != b_release_year:
+ return (Status.DIFFERENT, Miss.YEAR)
+ return (Status.EXACT, OK.TITLE_AUTHOR_MATCH)
+
+ a_slug_title = slugify_string(a.get("title"))
+ b_slug_title = slugify_string(b.get("title"))
+
+ if a_slug_title and b_slug_title and a_slug_title == b_slug_title:
+ if a_authors and len(a_authors & b_authors) > 0:
+ if arxiv_id_a is not None and arxiv_id_b is None or arxiv_id_a is None and arxiv_id_b is not None:
+ return (Status.STRONG, OK.PREPRINT_PUBLISHED)
+
+ arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
+ arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
+ if arxiv_id_a and arxiv_id_b:
+ id_a, version_a = arxiv_id_a.split("v")
+ id_b, version_b = arxiv_id_b.split("v")
+ if id_a == id_b:
+ return (Status.STRONG, OK.ARXIV_VERSION)
+ else:
+ return (Status.DIFFERENT, Miss.ARXIV_VERSION)
+
+ if a_authors and len(a_authors & b_authors) == 0:
+ return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)
+
+ return (Status.AMBIGUOUS, OK.DUMMY)
diff --git a/tests/test_verify.py b/tests/test_verify.py
new file mode 100644
index 0000000..be4b0ec
--- /dev/null
+++ b/tests/test_verify.py
@@ -0,0 +1,31 @@
+import operator
+import os
+import yaml
+try:
+ from yaml import CLoader as Loader
+except ImportError:
+ from yaml import Loader
+
+from fuzzycat.verify import compare, Status
+
+
+def test_verify_cases():
+ """
+ Test verification cases, via yaml.
+ """
+ status_map = {
+ "AMBIGUOUS": Status.AMBIGUOUS,
+ "DIFFERENT": Status.DIFFERENT,
+ "EXACT": Status.EXACT,
+ "STRONG": Status.STRONG,
+ "WEAK": Status.WEAK,
+ }
+ fields = operator.itemgetter("a", "b", "status", "about")
+ folder = os.path.join(os.path.dirname(__file__), "test_verify")
+ for root, _, files in os.walk(folder):
+ for fn in files:
+ with open(os.path.join(root, fn)) as f:
+ doc = yaml.load(f, Loader=Loader)
+ a, b, status, about = fields(doc)
+ result, _ = compare(a, b)
+ assert status_map.get(status) == result, about
diff --git a/tests/test_verify/0000.yml b/tests/test_verify/0000.yml
new file mode 100644
index 0000000..a82b2fe
--- /dev/null
+++ b/tests/test_verify/0000.yml
@@ -0,0 +1,104 @@
+about: Same document should be an exact match.
+status: EXACT
+a:
+ abstracts:
+ - content: Belgium Herbarium image of Meise Botanic Garden.
+ lang: de
+ mimetype: text/plain
+ sha1: cd3c76f5fd94bcf260f9ad74f797d9e79a824b1d
+ contribs:
+ - index: 0
+ raw_name: Meise Botanic Garden
+ role: author
+ ext_ids:
+ doi: 10.5281/zenodo.2830437
+ extra:
+ datacite:
+ license:
+ - rights: Creative Commons Attribution Share Alike 4.0 International
+ rightsUri: http://creativecommons.org/licenses/by-sa/4.0/legalcode
+ - rights: Open Access
+ rightsUri: info:eu-repo/semantics/openAccess
+ relations:
+ - relatedIdentifier: 10.5281/zenodo.2830436
+ relatedIdentifierType: DOI
+ relationType: IsVersionOf
+ - relatedIdentifier: https://zenodo.org/communities/belgiumherbarium
+ relatedIdentifierType: URL
+ relationType: IsPartOf
+ resourceType: Photo
+ resourceTypeGeneral: Image
+ subjects:
+ - subject: Biodiversity
+ - subject: Taxonomy
+ - subject: Terrestrial
+ - subject: Herbarium
+ - subject: Caryophyllaceae
+ release_month: 5
+ files: []
+ filesets: []
+ ident: jihezebuzbgxpmsj3356idy52e
+ license_slug: CC-BY-SA
+ publisher: Zenodo
+ refs: []
+ release_date: "2019-05-14"
+ release_stage: published
+ release_type: graphic
+ release_year: 2019
+ revision: 560ca270-45c5-4f21-89a6-0dfd73039546
+ state: active
+ title: Dianthus carthusianorum L. (BR0000005352692)
+ webcaptures: []
+ work_id: aaaaa34uyngfplcgmoejzjyjne
+b:
+ abstracts:
+ - content: Belgium Herbarium image of Meise Botanic Garden.
+ lang: de
+ mimetype: text/plain
+ sha1: cd3c76f5fd94bcf260f9ad74f797d9e79a824b1d
+ contribs:
+ - index: 0
+ raw_name: Meise Botanic Garden
+ role: author
+ ext_ids:
+ doi: 10.5281/zenodo.2830437
+ extra:
+ datacite:
+ license:
+ - rights: Creative Commons Attribution Share Alike 4.0 International
+ rightsUri: http://creativecommons.org/licenses/by-sa/4.0/legalcode
+ - rights: Open Access
+ rightsUri: info:eu-repo/semantics/openAccess
+ relations:
+ - relatedIdentifier: 10.5281/zenodo.2830436
+ relatedIdentifierType: DOI
+ relationType: IsVersionOf
+ - relatedIdentifier: https://zenodo.org/communities/belgiumherbarium
+ relatedIdentifierType: URL
+ relationType: IsPartOf
+ resourceType: Photo
+ resourceTypeGeneral: Image
+ subjects:
+ - subject: Biodiversity
+ - subject: Taxonomy
+ - subject: Terrestrial
+ - subject: Herbarium
+ - subject: Caryophyllaceae
+ release_month: 5
+ files: []
+ filesets: []
+ ident: jihezebuzbgxpmsj3356idy52e
+ license_slug: CC-BY-SA
+ publisher: Zenodo
+ refs: []
+ release_date: "2019-05-14"
+ release_stage: published
+ release_type: graphic
+ release_year: 2019
+ revision: 560ca270-45c5-4f21-89a6-0dfd73039546
+ state: active
+ title: Dianthus carthusianorum L. (BR0000005352692)
+ webcaptures: []
+ work_id: aaaaa34uyngfplcgmoejzjyjne
+
+