aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-21 00:13:02 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-21 00:13:02 +0100
commita5b9e0fee9f97f0205dfc96d35fc4b2cc823554b (patch)
tree5becd556d6edec553f5670a3bf7c0e94c745858c
parent668727a06cf2552783cc03af7721f6781b578d56 (diff)
downloadfuzzycat-a5b9e0fee9f97f0205dfc96d35fc4b2cc823554b.tar.gz
fuzzycat-a5b9e0fee9f97f0205dfc96d35fc4b2cc823554b.zip
wip: datacite, figshare versions
-rw-r--r--fuzzycat/verify.py41
1 files changed, 35 insertions, 6 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9ed7fdd..609d617 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -84,11 +84,13 @@ class OK(str, Enum):
Reason for assuming we have a match.
"""
ARXIV_VERSION = 'ok.arxiv_version'
+ FIGSHARE_VERSION = 'ok.figshare_version'
DUMMY = 'ok.dummy'
TITLE_AUTHOR_MATCH = 'ok.title_author_match'
PREPRINT_PUBLISHED = 'ok.preprint_published'
SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
TOKENIZED_AUTHORS = 'ok.tokenized_authors'
+ DATACITE_RELATED_ID = 'ok.datacite_related_id'
class Miss(str, Enum):
@@ -186,18 +188,45 @@ def compare(a, b):
if re.match(r"appendix ?[^ ]*$", a.get("title", "").lower()):
return (Status.AMBIGUOUS, Miss.APPENDIX)
- # TODO: figshare versions
-
- if a.get("doi").startswith("10.6084/") and b.get("doi").startswith("10.6084/")
+ # TODO: figshare versions, "xxx.v1"
+ FIGSHARE_PREFIX = "10.6084"
+ if a.get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get("doi").startswith(FIGSHARE_PREFIX +
+ "/"):
+ a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("doi"))
+ b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", b.get("doi"))
+ if a_doi_v_stripped == b_doi_v_stripped:
+ return (Status.STRONG, OK.FIGSHARE_VERSION)
# TODO: datacite specific vocabulary
# extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...}
# beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4
# TODO: does glom help?
# ...
- def datacite_relations(doc):
- pass
- # doc.get("extra", {}).get("
+ if "datacite" in a.get("extra") and "datacite" in b.get("extra"):
+ # relationType values accepted below: HasVersion, IsVersionOf, IsNewVersionOf,
+ # IsPreviousVersionOf, IsPartOf, HasPart (IsSupplementTo/IsSupplementedBy are not used)
+ whitelist = set([
+ "HasVersion", "IsVersionOf", "IsNewVersionOf", "IsPreviousVersionOf", "IsPartOf",
+ "HasPart"
+ ])
+
+        def get_related_doi(doc):
+            """Return the set of DOIs that doc's whitelisted datacite relations point to."""
+            dois = set()
+            for rel in doc.get("extra", {}).get("datacite", {}).get("relations", []):
+                if rel.get("relationType") not in whitelist:
+                    continue
+                if rel.get("relatedIdentifierType") != "DOI":
+                    continue
+                doi = rel.get("relatedIdentifier")
+                if doi:  # ignore relations that carry no identifier
+                    dois.add(doi)
+            return dois
+
+ a_doi_rel = get_related_doi(a)
+ b_doi_rel = get_related_doi(b)
+ if b.get("doi") in a_doi_rel or a.get("doi") in b_doi_rel:
+ return (Status.STRONG, OK.DATACITE_RELATED_ID)
arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
arxiv_id_b = b.get("ext_ids", {}).get("arxiv")