about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-12-11 02:17:06 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-12-11 02:17:06 +0100
commit: e5bcf8ba46b6851b677078358b7ffd26072c2523 (patch)
tree: 0c84c21efdbe6773d5713415fd7e94537c51f1e3
parent: df70259a6c42fc17245df419fdcdc73f9c7776f1 (diff)
download: fuzzycat-e5bcf8ba46b6851b677078358b7ffd26072c2523.tar.gz
          fuzzycat-e5bcf8ba46b6851b677078358b7ffd26072c2523.zip
add generic doi version case
-rw-r--r--  fuzzycat/utils.py  14
-rw-r--r--  fuzzycat/verify.py  33
-rw-r--r--  tests/data/release/6kuxfopbcjcrdnhvfokjgbd5wm  67
-rw-r--r--  tests/data/release/c43itb7esjc3heb64xbohigqge  20
-rw-r--r--  tests/data/verify.csv  3
-rw-r--r--  tests/test_verify.py  6
6 files changed, 122 insertions, 21 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index ef3b418..2dc2adb 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -3,12 +3,26 @@ import itertools
import re
import string
+from glom import glom, PathAccessError
+
printable_no_punct = string.digits + string.ascii_letters + string.whitespace
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
+def dict_key_exists(doc, path):
+ """
+ Return true, if a value at a given path exists. XXX: probably in glom, too.
+ """
+ try:
+ _ = glom(doc, path)
+ except PathAccessError:
+ return False
+ else:
+ return True
+
+
def has_doi_prefix(v, prefix="10.1234"):
"""
Returns False, if we cannot parse v or prefix does not match.
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9f5aa4f..94e8327 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -77,8 +77,8 @@ from glom import PathAccessError, glom
from fuzzycat.common import OK, Miss, Status
from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST,
TITLE_FRAGMENT_BLACKLIST)
-from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, has_doi_prefix,
- jaccard, num_project, slugify_string)
+from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists,
+ has_doi_prefix, jaccard, num_project, slugify_string)
# The result of clustering are documents that have a key k and a list of values
# (of the cluster) v.
@@ -129,7 +129,7 @@ class GroupVerifier:
if re.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST:
self.counter["skip.publisher_blacklist"] += 1
continue
- result, reason = compare(a, b)
+ result, reason = verify(a, b)
self.counter[reason] += 1
print("https://fatcat.wiki/release/{}".format(a["ident"]),
"https://fatcat.wiki/release/{}".format(b["ident"]), result, reason)
@@ -137,21 +137,9 @@ class GroupVerifier:
self.counter["total"] = sum(v for _, v in self.counter.items())
-def dict_key_exists(doc, path):
+def verify(a, b):
"""
- Return true, if a value at a given path exists. XXX: probably in glom, too.
- """
- try:
- _ = glom(doc, path)
- except PathAccessError:
- return False
- else:
- return True
-
-
-def compare(a, b):
- """
- Compare two entities, return match status and reason.
+ Compare two entities (dicts), return tuple of match status and reason.
TODO: We might want a bunch of kwargs for things like year gap threshold
and the like.
@@ -263,6 +251,17 @@ def compare(a, b):
except PathAccessError:
pass
+ # A paper/component pattern. 10.1021/acs.cgd.7b00396,
+ # https://fatcat.wiki/release/c43itb7esjc3heb64xbohigqge,
+ # https://fatcat.wiki/release/6kuxfopbcjcrdnhvfokjgbd5wm
+ try:
+ a_doi = glom(a, "ext_ids.doi")
+ b_doi = glom(b, "ext_ids.doi")
+ if a_doi.split(".")[:-1] == b_doi.split(".") or a_doi.split(".") == b_doi.split(".")[:-1]:
+ return (Status.STRONG, OK.VERSIONED_DOI)
+ except PathAccessError:
+ pass
+
# TODO: datacite specific vocabulary
# extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...}
# beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4
diff --git a/tests/data/release/6kuxfopbcjcrdnhvfokjgbd5wm b/tests/data/release/6kuxfopbcjcrdnhvfokjgbd5wm
new file mode 100644
index 0000000..9e028a8
--- /dev/null
+++ b/tests/data/release/6kuxfopbcjcrdnhvfokjgbd5wm
@@ -0,0 +1,67 @@
+{
+ "abstracts": [],
+ "container_id": "tfncqskjxjgbvilbdcq654of3m",
+ "contribs": [
+ {
+ "creator_id": "xf7grzc3gjfg5kbwbgfvppmn5m",
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_affiliation": "Electronic Materials Research\nDivision, Osaka Municipal Technical Research Institute, Joto-ku, Osaka 536-8553, Japan",
+ "raw_name": "Tsutomu Shinagawa",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "raw_affiliation": "Electronic Materials Research\nDivision, Osaka Municipal Technical Research Institute, Joto-ku, Osaka 536-8553, Japan",
+ "raw_name": "Mitsuru Watanabe",
+ "role": "author"
+ },
+ {
+ "index": 2,
+ "raw_affiliation": "Electronic Materials Research\nDivision, Osaka Municipal Technical Research Institute, Joto-ku, Osaka 536-8553, Japan",
+ "raw_name": "Jun-ichi Tani",
+ "role": "author"
+ },
+ {
+ "index": 3,
+ "raw_affiliation": "Electronic Materials Research\nDivision, Osaka Municipal Technical Research Institute, Joto-ku, Osaka 536-8553, Japan",
+ "raw_name": "Masaya Chigane",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1021/acs.cgd.7b00396"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "10.1021/acs.cgd.7b00396"
+ ],
+ "funder": [
+ {
+ "DOI": "10.13039/501100001700",
+ "award": [],
+ "doi-asserted-by": "publisher",
+ "name": "Ministry of Education, Culture, Sports, Science and Technology"
+ }
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "6kuxfopbcjcrdnhvfokjgbd5wm",
+ "language": "en",
+ "pages": "3826-3833",
+ "publisher": "American Chemical Society (ACS)",
+ "refs": [],
+ "release_date": "2017-06-08",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2017,
+ "revision": "feea6f2d-0996-4e3a-bd8d-1c543a228699",
+ "state": "active",
+ "title": "(0001)-Oriented Single-Crystal-Like Porous ZnO on ITO Substrates via Quasi-Topotactic Transformation from (001)-Oriented Zinc Hydroxychloride Crystals",
+ "volume": "17",
+ "work_id": "qc6y573kejhurfijmya7gmymeq"
+}
diff --git a/tests/data/release/c43itb7esjc3heb64xbohigqge b/tests/data/release/c43itb7esjc3heb64xbohigqge
new file mode 100644
index 0000000..7f975ab
--- /dev/null
+++ b/tests/data/release/c43itb7esjc3heb64xbohigqge
@@ -0,0 +1,20 @@
+{
+ "abstracts": [],
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1021/acs.cgd.7b00396.s001"
+ },
+ "extra": {
+ "crossref": {
+ "type": "component"
+ }
+ },
+ "ident": "c43itb7esjc3heb64xbohigqge",
+ "publisher": "American Chemical Society (ACS)",
+ "refs": [],
+ "release_type": "component",
+ "revision": "728aa1ed-533d-4517-9738-384b76ae69b8",
+ "state": "active",
+ "title": "(0001)-Oriented Single-Crystal-Like Porous ZnO on ITO Substrates via Quasi-Topotactic Transformation from (001)-Oriented Zinc Hydroxychloride Crystals",
+ "work_id": "cum2sjlwkbazzbhf43iq3vozuu"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 0439292..d90dc96 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -148,7 +148,7 @@ pobnow7sxfhnxhltgwpru5k7oi,uplqxenmk5axjes6zokml6q73y,Status.DIFFERENT,Miss.RELE
tm3gaiumkvb3xc7t3i6suna6u4,pobnow7sxfhnxhltgwpru5k7oi,Status.DIFFERENT,Miss.RELEASE_TYPE
lqswbciv2vfkzit5zamjaqik6m,zularouecbg5fg4nd6yswxf3s4,Status.DIFFERENT,Miss.JSTOR_ID
j6ipokw3lfflhl2de7afxhac2a,rbgpleyhanakxing2f3234d7xq,Status.AMBIGUOUS,
-bruczmzvnzhtdkd2tf3meg3oou,a7wuehxrv5edpb5265qx27yvmy,Status.AMBIGUOUS,
+bruczmzvnzhtdkd2tf3meg3oou,a7wuehxrv5edpb5265qx27yvmy,Status.STRONG,OK.VERSIONED_DOI
tebqkxnjpzfxnpsqmt5klv2ppm,uqyjav3arngq7bqmzsllxrkpmu,Status.DIFFERENT,
e3fs7ttdbrds3bvsbm7lynzlpu,vpswmj3cgfhktggwvmz33fkwuq,Status.DIFFERENT,
gtsbvudmjzdeppqgzjpmfedycq,27lrseg7jfhxbdxohph7il7a7m,Status.DIFFERENT,Miss.JSTOR_ID
@@ -159,3 +159,4 @@ qnblx3fetbegpe7ryt444dpkke,kokj44xkcfhxvorj7cs7rov2ku,Status.DIFFERENT,Miss.RELE
vrwrf372jbd2vbwcb6fllsvhae,s43ecmng5bbqzcqhxmo7wbfsma,Status.DIFFERENT,Miss.RELEASE_TYPE
4z2amr4cizd2jexlr7uu4jxrsa,nvyd2rotrraelcuchnu6cjbxty,Status.STRONG,OK.PMID_DOI_PAIR
qqsdtxm5hjadta3jf7bgt3bnm4,fupvtkn7t5d5xohffx5bt4yn24,Status.AMBIGUOUS,
+6kuxfopbcjcrdnhvfokjgbd5wm,c43itb7esjc3heb64xbohigqge,Status.STRONG,
diff --git a/tests/test_verify.py b/tests/test_verify.py
index 07808af..79c3143 100644
--- a/tests/test_verify.py
+++ b/tests/test_verify.py
@@ -5,7 +5,7 @@ import os
import pytest
-from fuzzycat.verify import Status, compare
+from fuzzycat.verify import Status, verify
VERIFY_CSV = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/verify.csv")
RELEASE_ENTITIES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/release")
@@ -32,7 +32,7 @@ def load_release_ident(ident):
return json.load(f)
-def test_compare():
+def test_verify():
with open(VERIFY_CSV) as f:
reader = csv.reader(f, delimiter=',')
for i, row in enumerate(reader):
@@ -42,7 +42,7 @@ def test_compare():
pytest.fail(
"invalid test file, maybe too many (or few) commas in row {}? {}".format(
i + 1, exc))
- status, reason = compare(load_release_ident(a), load_release_ident(b))
+ status, reason = verify(load_release_ident(a), load_release_ident(b))
if not expected_status or expected_status.lower() == "todo":
logger.warning(
"skipping test {base}release/{a} {base}release/{b} -- no result defined (we think {status}, {reason})"