about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-26 11:20:35 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-26 11:20:35 +0100
commit 47761357e341b7355c98778ec1665ae73e5d6fe3 (patch)
tree 14c0af2f9a6cc52bf8fd784d1ad8ab477c9eaef3
parent 67589a0bc9217f3259f0f093b5283b9e92828d0f (diff)
download fuzzycat-47761357e341b7355c98778ec1665ae73e5d6fe3.tar.gz
download fuzzycat-47761357e341b7355c98778ec1665ae73e5d6fe3.zip
figshare fix
-rw-r--r-- fuzzycat/utils.py 4
-rw-r--r-- fuzzycat/verify.py 13
-rw-r--r-- tests/test_verify.py 5
3 files changed, 12 insertions, 10 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 1cac668..4d1325d 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,13 +1,14 @@
import io
import itertools
-import string
import re
+import string
printable_no_punct = string.digits + string.ascii_letters + string.whitespace
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
+
def slugify_string(s: str) -> str:
"""
Keeps ascii chars and single whitespace only.
@@ -89,4 +90,3 @@ def contains_chemical_formula(s):
for token in s.split():
if CHEM_FORMULA.search(token):
return True
-
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index d111871..d7b2d62 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -138,7 +138,8 @@ def compare(a, b):
"""
Compare two entities, return match status and reason.
"""
- if a.get("doi") and b.get("doi") and a.get("doi") == b.get("doi"):
+ if a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and a.get(
+ "ext_ids", {}).get("doi") == b.get("ext_ids", {}).get("doi"):
return (Status.EXACT, OK.DOI)
if len(a.get("title", "")) < 5:
return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
@@ -157,10 +158,11 @@ def compare(a, b):
# TODO: figshare versions, "xxx.v1"
FIGSHARE_PREFIX = "10.6084"
- if a.get("doi") and b.get("doi") and a.get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get(
- "doi").startswith(FIGSHARE_PREFIX + "/"):
- a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("doi"))
- b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("doi"))
+ if a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and a.get(
+ "ext_ids", {}).get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get(
+ "ext_ids", {}).get("doi").startswith(FIGSHARE_PREFIX + "/"):
+ a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("ext_ids", {}).get("doi", ""))
+ b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("ext_ids", {}).get("doi", ""))
if a_doi_v_stripped == b_doi_v_stripped:
return (Status.STRONG, OK.FIGSHARE_VERSION)
@@ -3542,4 +3544,3 @@ TITLE_BLACKLIST = set([
"週刊ダイヤモンド = diamond weekly 69(1)",
"週刊ダイヤモンド = diamond weekly 別冊",
])
-
diff --git a/tests/test_verify.py b/tests/test_verify.py
index ebbb490..0c04d53 100644
--- a/tests/test_verify.py
+++ b/tests/test_verify.py
@@ -41,11 +41,12 @@ def test_compare():
pytest.fail("invalid test file, maybe missing a comma? {}".format(exc))
status, reason = compare(load_release_ident(a), load_release_ident(b))
if not expected_status or expected_status.lower() == "todo":
- logger.warn(
+ logger.warning(
"skipping test {base}/release/{a} {base}/release/{b} -- no result defined (we think {status}, {reason})"
.format(a=a, b=b, base=FATCAT_BASE_URL, status=status, reason=reason))
assert status == status, "status: want {}, got {} for {} {}".format(
expected_status, status, a, b)
if expected_reason:
- assert expected_reason.lower() == reason.lower(), "reason [{} {}]: want {}, got {}".format(a, b, expected_reason, reason)
+ assert expected_reason.lower() == reason.lower(
+ ), "reason [{} {}]: want {}, got {}".format(a, b, expected_reason, reason)
logger.info("ran verification over {} cases (https://git.io/JkDgS)".format(i))