start rewrite of compare

author: Martin Czygan <martin.czygan@gmail.com> 2020-11-26 23:01:26 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-26 23:01:26 +0100
commit: fd8004910146763a4e8395671d004609024d3700 (patch)
tree: d01182e03605c4e7daeefc5b10d6116410af4a44
parent: bf929a202373b976504aa362b079fef54ee326a7 (diff)
download: fuzzycat-fd8004910146763a4e8395671d004609024d3700.tar.gz
fuzzycat-fd8004910146763a4e8395671d004609024d3700.zip
2 files changed, 50 insertions, 39 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index ffd0f47..43f011e 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -72,6 +72,8 @@ import operator
 import re
 import sys
 
+from glom import PathAccessError, glom
+
 from fuzzycat.common import OK, Miss, Status
 from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, num_project,
                             slugify_string)
@@ -138,63 +140,72 @@ def compare(a, b):
     """
     Compare two entities, return match status and reason.
     """
-    if a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and a.get(
-            "ext_ids", {}).get("doi") == b.get("ext_ids", {}).get("doi"):
-        return (Status.EXACT, OK.DOI)
-    if len(a.get("title", "")) < 5:
+    try:
+        if glom(a, "ext_ids.doi") == glom(b, "ext_ids.doi"):
+            return (Status.EXACT, OK.DOI)
+    except PathAccessError:
+        pass
+
+    a_title = a.get("title", "")
+    a_title_lower = a_title.lower()
+    b_title = b.get("title", "")
+    b_title_lower = b_title.lower()
+
+    if len(a_title) < 5:
         return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
-    if a.get("title", "").lower() in TITLE_BLACKLIST:
+    if a_title_lower in TITLE_BLACKLIST:
         return (Status.AMBIGUOUS, Miss.BLACKLISTED)
 
     for fragment in TITLE_FRAGMENT_BLACKLIST:
-        if fragment in a.get("title", "").lower():
+        if fragment in a_title_lower:
             return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT)
 
-    if "Zweckverband Volkshochschule " in a.get("title") and a.get("title") != b.get("title"):
+    if "Zweckverband Volkshochschule " in a_title and a_title != b_title:
         return (Status.DIFFERENT, Miss.CUSTOM_VHS)
 
-    if re.match(r"appendix ?[^ ]*$", a.get("title", "").lower()):
+    if re.match(r"appendix ?[^ ]*$", a_title_lower):
         return (Status.AMBIGUOUS, Miss.APPENDIX)
 
-    # TODO: figshare versions, "xxx.v1"
-    FIGSHARE_PREFIX = "10.6084"
-    if a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and a.get(
-            "ext_ids", {}).get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get(
-                "ext_ids", {}).get("doi").startswith(FIGSHARE_PREFIX + "/"):
-        a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("ext_ids", {}).get("doi", ""))
-        b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("ext_ids", {}).get("doi", ""))
-        if a_doi_v_stripped == b_doi_v_stripped:
-            return (Status.STRONG, OK.FIGSHARE_VERSION)
+    try:
+        # TODO: figshare versions, "xxx.v1"
+        FIGSHARE_PREFIX = "10.6084/"
+        if glom(a, "ext_ids.doi").startswith(FIGSHARE_PREFIX) and glom(
+                b, "ext_ids.doi").startswith(FIGSHARE_PREFIX):
+            a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(a, "ext_ids.doi"))
+            b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(b, "ext_ids.doi"))
+            if a_doi_v_stripped == b_doi_v_stripped:
+                return (Status.STRONG, OK.FIGSHARE_VERSION)
+    except PathAccessError:
+        pass
 
     # TODO: datacite specific vocabulary
     # extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...}
     # beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4
     # TODO: does glom help?
     # ...
-    if "datacite" in a.get("extra", {}) and "datacite" in b.get("extra", {}):
-        # Relevant relationType values: IsSupplementTo, IsSupplementedBy,
-        # HasVersion, IsVersionOf, IsNewVersionOf, IsPreviousVersionOf
+    if "datacite" in a.get("extra") and "datacite" in b.get("extra"):
         whitelist = set([
-            "HasVersion", "IsVersionOf", "IsNewVersionOf", "IsPreviousVersionOf", "IsPartOf",
-            "HasPart"
+            "HasPart",
+            "HasVersion",
+            "IsNewVersionOf",
+            "IsPartOf",
+            "IsPreviousVersionOf",
+            "IsVersionOf",
         ])
 
-        def get_related_doi(doc):
-            dois = set()
-            for rel in doc.get("extra", {}).get("datacite", {}).get("relations", []):
-                if rel.get("relationType") not in whitelist:
-                    continue
-                if rel.get("relatedIdentifierType") != "DOI":
-                    continue
-                doi = rel.get("relatedIdentifier")
-                if not doi:
-                    continue
-                dois.add(doi)
-            return dois
+        def get_datacite_related_doi(doc):
+            spec = ("extra.datacite.relations", [{
+                "type": "relatedIdentifierType",
+                "id": "relatedIdentifier"
+            }])
+            try:
+                return set([v["id"] for v in glom(doc, spec) if v["type"].lower() == "doi"])
+            except PathAccessError:
+                return set()
 
-        a_doi_rel = get_related_doi(a)
-        b_doi_rel = get_related_doi(b)
-        if b.get("doi") in a_doi_rel or a.get("doi") in b_doi_rel:
+        a_doi_rel = get_datacite_related_doi(a)
+        b_doi_rel = get_datacite_related_doi(b)
+        if glom(b, "ext_ids.doi") in a_doi_rel or glom(a, "ext_ids.doi") in b_doi_rel:
             return (Status.STRONG, OK.DATACITE_RELATED_ID)
 
     arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 96cec10..af125d2 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -1,4 +1,4 @@
-7kzrmoajzzedxgdvbltgqihszu,bd4crw4p7ber7pzhpoyw2c77bi,Status.STRONG,OK.TITLE_AUTHOR_MATCH
+7kzrmoajzzedxgdvbltgqihszu,bd4crw4p7ber7pzhpoyw2c77bi,Status.STRONG,OK.DATACITE_RELATED_ID
 foddwpevbjao3b3uwccvtuxfi4,versjalccvgdtp3q25elgy2z7a,Status.DIFFERENT,Miss.DATASET_DOI
 v2ypxs2yrbh57cdo6lfuiik64e,6zzx36tlefdtbftzpg4wtump3e,Status.STRONG,OK.ARXIV_VERSION
 hdvg6m467bhyng4l7xauk4ymoa,f5fugxp3qze2fht2uxt3xivi4i,Status.STRONG,OK.PREPRINT_PUBLISHED
@@ -41,7 +41,7 @@ lynlkp7wh5hn3mlpzcfz4faoqi,yrbvjd4xrjaq3jxt7pkheysclm,Status.DIFFERENT,Miss.YEAR
 t3vpox5wrvbgtcigp6a6o64oey,q5yaj5zbzjctzapb5bztzctsoe,Status.DIFFERENT,Miss.YEAR
 65qtai5dmjb2hmkwa73nwafyhu,p4lk4tbohjat3g5nn5pb3kjdyu,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
 fqtc2tonfbh7hlcwoxgxzqi4lu,ng7utp7murge3ksuzbtljf5bsq,Status.DIFFERENT,Miss.YEAR
-mbnr3nrdijerto6wfjnlsmfhga,ddikrsxnajblvchthiwcbsmiue,Status.EXACT,OK.TITLE_AUTHOR_MATCH
+mbnr3nrdijerto6wfjnlsmfhga,ddikrsxnajblvchthiwcbsmiue,Status.STRONG,OK.DATACITE_RELATED_ID
 nqfv37as6bcohketfrhiuac2mq,ty6megtz35c3hep57bbx2cetja,Status.DIFFERENT,Miss.YEAR
 cedhaxcvkrddpeedqtaxln4zsq,5hzpesjrjrdrzaoahvihorp7eq,Status.STRONG,OK.PREPRINT_PUBLISHED
 wwiarqhsgbevdc74f6i4qmvyhy,d35gplnuibe6djfhnh42o66zbm,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
author	Martin Czygan <martin.czygan@gmail.com>	2020-11-26 23:01:26 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-11-26 23:01:26 +0100
commit	fd8004910146763a4e8395671d004609024d3700 (patch)
tree	d01182e03605c4e7daeefc5b10d6116410af4a44
parent	bf929a202373b976504aa362b079fef54ee326a7 (diff)
download	fuzzycat-fd8004910146763a4e8395671d004609024d3700.tar.gz fuzzycat-fd8004910146763a4e8395671d004609024d3700.zip