diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-26 23:01:26 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-26 23:01:26 +0100 | 
| commit | fd8004910146763a4e8395671d004609024d3700 (patch) | |
| tree | d01182e03605c4e7daeefc5b10d6116410af4a44 | |
| parent | bf929a202373b976504aa362b079fef54ee326a7 (diff) | |
| download | fuzzycat-fd8004910146763a4e8395671d004609024d3700.tar.gz fuzzycat-fd8004910146763a4e8395671d004609024d3700.zip | |
start rewrite of compare
| -rw-r--r-- | fuzzycat/verify.py | 85 | ||||
| -rw-r--r-- | tests/data/verify.csv | 4 | 
2 files changed, 50 insertions, 39 deletions
| diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index ffd0f47..43f011e 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -72,6 +72,8 @@ import operator  import re  import sys +from glom import PathAccessError, glom +  from fuzzycat.common import OK, Miss, Status  from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, num_project,                              slugify_string) @@ -138,63 +140,72 @@ def compare(a, b):      """      Compare two entities, return match status and reason.      """ -    if a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and a.get( -            "ext_ids", {}).get("doi") == b.get("ext_ids", {}).get("doi"): -        return (Status.EXACT, OK.DOI) -    if len(a.get("title", "")) < 5: +    try: +        if glom(a, "ext_ids.doi") == glom(b, "ext_ids.doi"): +            return (Status.EXACT, OK.DOI) +    except PathAccessError: +        pass + +    a_title = a.get("title", "") +    a_title_lower = a_title.lower() +    b_title = b.get("title", "") +    b_title_lower = b_title.lower() + +    if len(a_title) < 5:          return (Status.AMBIGUOUS, Miss.SHORT_TITLE) -    if a.get("title", "").lower() in TITLE_BLACKLIST: +    if a_title_lower in TITLE_BLACKLIST:          return (Status.AMBIGUOUS, Miss.BLACKLISTED)      for fragment in TITLE_FRAGMENT_BLACKLIST: -        if fragment in a.get("title", "").lower(): +        if fragment in a_title_lower:              return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT) -    if "Zweckverband Volkshochschule " in a.get("title") and a.get("title") != b.get("title"): +    if "Zweckverband Volkshochschule " in a_title and a_title != b_title:          return (Status.DIFFERENT, Miss.CUSTOM_VHS) -    if re.match(r"appendix ?[^ ]*$", a.get("title", "").lower()): +    if re.match(r"appendix ?[^ ]*$", a_title_lower):          return (Status.AMBIGUOUS, Miss.APPENDIX) -    # TODO: figshare versions, "xxx.v1" -    FIGSHARE_PREFIX = "10.6084" -    if a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and a.get( -            "ext_ids", {}).get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get( -                "ext_ids", {}).get("doi").startswith(FIGSHARE_PREFIX + "/"): -        a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("ext_ids", {}).get("doi", "")) -        b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("ext_ids", {}).get("doi", "")) -        if a_doi_v_stripped == b_doi_v_stripped: -            return (Status.STRONG, OK.FIGSHARE_VERSION) +    try: +        # TODO: figshare versions, "xxx.v1" +        FIGSHARE_PREFIX = "10.6084/" +        if glom(a, "ext_ids.doi").startswith(FIGSHARE_PREFIX) and glom( +                b, "ext_ids.doi").startswith(FIGSHARE_PREFIX): +            a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(a, "ext_ids.doi")) +            b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(b, "ext_ids.doi")) +            if a_doi_v_stripped == b_doi_v_stripped: +                return (Status.STRONG, OK.FIGSHARE_VERSION) +    except PathAccessError: +        pass      # TODO: datacite specific vocabulary      # extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...}      # beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4      # TODO: does glom help?      # ... -    if "datacite" in a.get("extra", {}) and "datacite" in b.get("extra", {}): -        # Relevant relationType values: IsSupplementTo, IsSupplementedBy, -        # HasVersion, IsVersionOf, IsNewVersionOf, IsPreviousVersionOf +    if "datacite" in a.get("extra") and "datacite" in b.get("extra"):          whitelist = set([ -            "HasVersion", "IsVersionOf", "IsNewVersionOf", "IsPreviousVersionOf", "IsPartOf", -            "HasPart" +            "HasPart", +            "HasVersion", +            "IsNewVersionOf", +            "IsPartOf", +            "IsPreviousVersionOf", +            "IsVersionOf",          ]) -        def get_related_doi(doc): -            dois = set() -            for rel in doc.get("extra", {}).get("datacite", {}).get("relations", []): -                if rel.get("relationType") not in whitelist: -                    continue -                if rel.get("relatedIdentifierType") != "DOI": -                    continue -                doi = rel.get("relatedIdentifier") -                if not doi: -                    continue -                dois.add(doi) -            return dois +        def get_datacite_related_doi(doc): +            spec = ("extra.datacite.relations", [{ +                "type": "relatedIdentifierType", +                "id": "relatedIdentifier" +            }]) +            try: +                return set([v["id"] for v in glom(doc, spec) if v["type"].lower() == "doi"]) +            except PathAccessError: +                return set() -        a_doi_rel = get_related_doi(a) -        b_doi_rel = get_related_doi(b) -        if b.get("doi") in a_doi_rel or a.get("doi") in b_doi_rel: +        a_doi_rel = get_datacite_related_doi(a) +        b_doi_rel = get_datacite_related_doi(b) +        if glom(b, "ext_ids.doi") in a_doi_rel or glom(a, "ext_ids.doi") in b_doi_rel:              return (Status.STRONG, OK.DATACITE_RELATED_ID)      arxiv_id_a = a.get("ext_ids", {}).get("arxiv") diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 96cec10..af125d2 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -1,4 +1,4 @@ -7kzrmoajzzedxgdvbltgqihszu,bd4crw4p7ber7pzhpoyw2c77bi,Status.STRONG,OK.TITLE_AUTHOR_MATCH +7kzrmoajzzedxgdvbltgqihszu,bd4crw4p7ber7pzhpoyw2c77bi,Status.STRONG,OK.DATACITE_RELATED_ID  foddwpevbjao3b3uwccvtuxfi4,versjalccvgdtp3q25elgy2z7a,Status.DIFFERENT,Miss.DATASET_DOI  v2ypxs2yrbh57cdo6lfuiik64e,6zzx36tlefdtbftzpg4wtump3e,Status.STRONG,OK.ARXIV_VERSION  hdvg6m467bhyng4l7xauk4ymoa,f5fugxp3qze2fht2uxt3xivi4i,Status.STRONG,OK.PREPRINT_PUBLISHED @@ -41,7 +41,7 @@ lynlkp7wh5hn3mlpzcfz4faoqi,yrbvjd4xrjaq3jxt7pkheysclm,Status.DIFFERENT,Miss.YEAR  t3vpox5wrvbgtcigp6a6o64oey,q5yaj5zbzjctzapb5bztzctsoe,Status.DIFFERENT,Miss.YEAR  65qtai5dmjb2hmkwa73nwafyhu,p4lk4tbohjat3g5nn5pb3kjdyu,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY  fqtc2tonfbh7hlcwoxgxzqi4lu,ng7utp7murge3ksuzbtljf5bsq,Status.DIFFERENT,Miss.YEAR -mbnr3nrdijerto6wfjnlsmfhga,ddikrsxnajblvchthiwcbsmiue,Status.EXACT,OK.TITLE_AUTHOR_MATCH +mbnr3nrdijerto6wfjnlsmfhga,ddikrsxnajblvchthiwcbsmiue,Status.STRONG,OK.DATACITE_RELATED_ID  nqfv37as6bcohketfrhiuac2mq,ty6megtz35c3hep57bbx2cetja,Status.DIFFERENT,Miss.YEAR  cedhaxcvkrddpeedqtaxln4zsq,5hzpesjrjrdrzaoahvihorp7eq,Status.STRONG,OK.PREPRINT_PUBLISHED  wwiarqhsgbevdc74f6i4qmvyhy,d35gplnuibe6djfhnh42o66zbm,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY | 
