diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-21 00:13:02 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-21 00:13:02 +0100 |
commit | a5b9e0fee9f97f0205dfc96d35fc4b2cc823554b (patch) | |
tree | 5becd556d6edec553f5670a3bf7c0e94c745858c | |
parent | 668727a06cf2552783cc03af7721f6781b578d56 (diff) | |
download | fuzzycat-a5b9e0fee9f97f0205dfc96d35fc4b2cc823554b.tar.gz fuzzycat-a5b9e0fee9f97f0205dfc96d35fc4b2cc823554b.zip |
wip: datacite, figshare versions
-rw-r--r-- | fuzzycat/verify.py | 41 |
1 files changed, 35 insertions, 6 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 9ed7fdd..609d617 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -84,11 +84,13 @@ class OK(str, Enum): Reason for assuming we have a match. """ ARXIV_VERSION = 'ok.arxiv_version' + FIGSHARE_VERSION = 'ok.figshare_version' DUMMY = 'ok.dummy' TITLE_AUTHOR_MATCH = 'ok.title_author_match' PREPRINT_PUBLISHED = 'ok.preprint_published' SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match' TOKENIZED_AUTHORS = 'ok.tokenized_authors' + DATACITE_RELATED_ID = 'ok.datacite_related_id' class Miss(str, Enum): @@ -186,18 +188,45 @@ def compare(a, b): if re.match(r"appendix ?[^ ]*$", a.get("title", "").lower()): return (Status.AMBIGUOUS, Miss.APPENDIX) - # TODO: figshare versions - - if a.get("doi").startswith("10.6084/") and b.get("doi").startswith("10.6084/") + # TODO: figshare versions, "xxx.v1" + FIGSHARE_PREFIX = "10.6084" + if a.get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get("doi").startswith(FIGSHARE_PREFIX + + "/"): + a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("doi")) + b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("doi")) + if a_doi_v_stripped == b_doi_v_stripped: + return (Status.STRONG, OK.FIGSHARE_VERSION) # TODO: datacite specific vocabulary # extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...} # beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4 # TODO: does glom help? # ... - def datacite_relations(doc): - pass - # doc.get("extra", {}).get(" + if "datacite" in a.get("extra") and "datacite" in b.get("extra"): + # Relevant relationType values: IsSupplementTo, IsSupplementedBy, + # HasVersion, IsVersionOf, IsNewVersionOf, IsPreviousVersionOf + whitelist = set([ + "HasVersion", "IsVersionOf", "IsNewVersionOf", "IsPreviousVersionOf", "IsPartOf", + "HasPart" + ]) + + def get_related_doi(doc): + dois = set() + for rel in doc.get("extra", {}).get("datacite", {}).get("relations", []): + if rel.get("relationType") not in whitelist: + continue + if rel.get("relatedIdentifierType") != "DOI": + continue + doi = reg.get("relatedIdentifier") + if not doi: + continue + dois.add(doi) + return dois + + a_doi_rel = get_related_doi(a) + b_doi_rel = get_related_doi(b) + if b.get("doi") in a_doi_rel or a.get("doi") in b_doi_rel: + return (Status.STRONG, OK.DATACITE_RELATED_ID) arxiv_id_a = a.get("ext_ids", {}).get("arxiv") arxiv_id_b = b.get("ext_ids", {}).get("arxiv") |