diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-11 02:17:06 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-11 02:17:06 +0100 |
commit | e5bcf8ba46b6851b677078358b7ffd26072c2523 (patch) | |
tree | 0c84c21efdbe6773d5713415fd7e94537c51f1e3 /fuzzycat | |
parent | df70259a6c42fc17245df419fdcdc73f9c7776f1 (diff) | |
download | fuzzycat-e5bcf8ba46b6851b677078358b7ffd26072c2523.tar.gz fuzzycat-e5bcf8ba46b6851b677078358b7ffd26072c2523.zip |
add generic doi version case
Diffstat (limited to 'fuzzycat')
-rw-r--r-- | fuzzycat/utils.py | 14 | ||||
-rw-r--r-- | fuzzycat/verify.py | 33 |
2 files changed, 30 insertions, 17 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index ef3b418..2dc2adb 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -3,12 +3,26 @@ import itertools import re import string +from glom import glom, PathAccessError + printable_no_punct = string.digits + string.ascii_letters + string.whitespace # More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/ CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+") +def dict_key_exists(doc, path): + """ + Return true, if a value at a given path exists. XXX: probably in glom, too. + """ + try: + _ = glom(doc, path) + except PathAccessError: + return False + else: + return True + + def has_doi_prefix(v, prefix="10.1234"): """ Returns False, if we cannot parse v or prefix does not match. diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 9f5aa4f..94e8327 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -77,8 +77,8 @@ from glom import PathAccessError, glom from fuzzycat.common import OK, Miss, Status from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST, TITLE_FRAGMENT_BLACKLIST) -from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, has_doi_prefix, - jaccard, num_project, slugify_string) +from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists, + has_doi_prefix, jaccard, num_project, slugify_string) # The result of clustering are documents that have a key k and a list of values # (of the cluster) v. @@ -129,7 +129,7 @@ class GroupVerifier: if re.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST: self.counter["skip.publisher_blacklist"] += 1 continue - result, reason = compare(a, b) + result, reason = verify(a, b) self.counter[reason] += 1 print("https://fatcat.wiki/release/{}".format(a["ident"]), "https://fatcat.wiki/release/{}".format(b["ident"]), result, reason) @@ -137,21 +137,9 @@ class GroupVerifier: self.counter["total"] = sum(v for _, v in self.counter.items()) -def dict_key_exists(doc, path): +def verify(a, b): """ - Return true, if a value at a given path exists. XXX: probably in glom, too. - """ - try: - _ = glom(doc, path) - except PathAccessError: - return False - else: - return True - - -def compare(a, b): - """ - Compare two entities, return match status and reason. + Compare two entities (dicts), return tuple of match status and reason. TODO: We might want a bunch of kwargs for things like year gap threshold and the like. @@ -263,6 +251,17 @@ def compare(a, b): except PathAccessError: pass + # A paper/component pattern. 10.1021/acs.cgd.7b00396, + # https://fatcat.wiki/release/c43itb7esjc3heb64xbohigqge, + # https://fatcat.wiki/release/6kuxfopbcjcrdnhvfokjgbd5wm + try: + a_doi = glom(a, "ext_ids.doi") + b_doi = glom(b, "ext_ids.doi") + if a_doi.split(".")[:-1] == b_doi.split(".") or a_doi.split(".") == b_doi.split(".")[:-1]: + return (Status.STRONG, OK.VERSIONED_DOI) + except PathAccessError: + pass + # TODO: datacite specific vocabulary # extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...} # beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4 |