aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/verify.py
diff options
context:
space:
mode:
Diffstat (limited to 'fuzzycat/verify.py')
-rw-r--r--fuzzycat/verify.py33
1 files changed, 16 insertions, 17 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9f5aa4f..94e8327 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -77,8 +77,8 @@ from glom import PathAccessError, glom
from fuzzycat.common import OK, Miss, Status
from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST,
TITLE_FRAGMENT_BLACKLIST)
-from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, has_doi_prefix,
- jaccard, num_project, slugify_string)
+from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists,
+ has_doi_prefix, jaccard, num_project, slugify_string)
# The result of clustering are documents that have a key k and a list of values
# (of the cluster) v.
@@ -129,7 +129,7 @@ class GroupVerifier:
if re.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST:
self.counter["skip.publisher_blacklist"] += 1
continue
- result, reason = compare(a, b)
+ result, reason = verify(a, b)
self.counter[reason] += 1
print("https://fatcat.wiki/release/{}".format(a["ident"]),
"https://fatcat.wiki/release/{}".format(b["ident"]), result, reason)
@@ -137,21 +137,9 @@ class GroupVerifier:
self.counter["total"] = sum(v for _, v in self.counter.items())
-def dict_key_exists(doc, path):
+def verify(a, b):
"""
- Return true, if a value at a given path exists. XXX: probably in glom, too.
- """
- try:
- _ = glom(doc, path)
- except PathAccessError:
- return False
- else:
- return True
-
-
-def compare(a, b):
- """
- Compare two entities, return match status and reason.
+ Compare two entities (dicts), return tuple of match status and reason.
TODO: We might want a bunch of kwargs for things like year gap threshold
and the like.
@@ -263,6 +251,17 @@ def compare(a, b):
except PathAccessError:
pass
+ # A paper/component pattern: one DOI equals the other plus one trailing ".suffix" segment. 10.1021/acs.cgd.7b00396,
+ # https://fatcat.wiki/release/c43itb7esjc3heb64xbohigqge,
+ # https://fatcat.wiki/release/6kuxfopbcjcrdnhvfokjgbd5wm
+ try:
+ a_doi = glom(a, "ext_ids.doi")
+ b_doi = glom(b, "ext_ids.doi")
+ if a_doi.split(".")[:-1] == b_doi.split(".") or a_doi.split(".") == b_doi.split(".")[:-1]:
+ return (Status.STRONG, OK.VERSIONED_DOI)
+ except PathAccessError:
+ pass
+
# TODO: datacite specific vocabulary
# extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...}
# beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4