From 72079139edd2e95a8d9e19dff5590771fee52202 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 1 Dec 2020 23:54:02 +0100 Subject: verify: bsi undated --- fuzzycat/common.py | 1 + fuzzycat/utils.py | 9 +++++++++ fuzzycat/verify.py | 22 +++++++++++++--------- notes/todo.md | 6 +++++- tests/data/release/gcqdvvjiq5bphl7lpc4invi4vy | 23 +++++++++++++++++++++++ tests/data/release/neznj5fb4nf3tdqnotnbe34b6e | 23 +++++++++++++++++++++++ tests/data/verify.csv | 3 ++- 7 files changed, 76 insertions(+), 11 deletions(-) create mode 100644 tests/data/release/gcqdvvjiq5bphl7lpc4invi4vy create mode 100644 tests/data/release/neznj5fb4nf3tdqnotnbe34b6e diff --git a/fuzzycat/common.py b/fuzzycat/common.py index 2298185..34508b7 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -26,6 +26,7 @@ class OK(str, Enum): TITLE_AUTHOR_MATCH = 'ok.title_author_match' TOKENIZED_AUTHORS = 'ok.tokenized_authors' CUSTOM_IEEE_ARXIV = 'ok.custom_ieee_arxiv' + CUSTOM_BSI_UNDATED = 'ok.custom_bsi_undated' class Miss(str, Enum): diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index d6beb03..ef3b418 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -9,6 +9,15 @@ printable_no_punct = string.digits + string.ascii_letters + string.whitespace CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+") +def has_doi_prefix(v, prefix="10.1234"): + """ + Returns False, if we cannot parse v or prefix does not match. + """ + if not v: + return False + return v.split("/")[0] == prefix + + def slugify_string(s: str) -> str: """ Keeps ascii chars and single whitespace only. diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 84e17d8..e688d49 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -75,8 +75,8 @@ import sys from glom import PathAccessError, glom from fuzzycat.common import OK, Miss, Status -from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, num_project, - slugify_string) +from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, has_doi_prefix, + num_project, slugify_string) # The result of clustering are documents that have a key k and a list of values # (of the cluster) v. @@ -171,6 +171,15 @@ def compare(a, b): except PathAccessError: pass + try: + a_doi = glom(a, "ext_ids.doi") + b_doi = glom(b, "ext_ids.doi") + if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"): + if a_doi + "u" == b_doi or b_doi + "u" == a_doi: + return (Status.STRONG, OK.CUSTOM_BSI_UNDATED) + except PathAccessError: + pass + if "Zweckverband Volkshochschule " in a_title and a_title != b_title: return (Status.DIFFERENT, Miss.CUSTOM_VHS) @@ -275,13 +284,6 @@ def compare(a, b): a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ") b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ") - try: - if glom(a, "ext_ids.doi") == "10.1109/nssmic.2013.6829591": - print(a_slug_title) - print(b_slug_title) - except PathAccessError: - pass - if a_slug_title == b_slug_title: # via: https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi # https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily @@ -294,6 +296,8 @@ def compare(a, b): except PathAccessError: pass + # TODO: we might want to have some light python DSL to express these + # (commute) things result = ieee_arxiv_pair_check(a, b) if result: return result diff --git a/notes/todo.md b/notes/todo.md index aeaf6f0..107544d 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -36,11 +36,15 @@ STKE "fulltext" link does not lead anywhere; discontinued. > book vs article-journal -* [ ] https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily +* [x] https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily > preprint and IEEE published article * [ ] https://fatcat.wiki/release/neznj5fb4nf3tdqnotnbe34b6e https://fatcat.wiki/release/gcqdvvjiq5bphl7lpc4invi4vy + +> a standard document; DOI and DOIu -- which means "undated" (as per URL) -- +> https://landingpage.bsigroup.com/LandingPage/Undated?UPI=000000000030281171 + * [ ] https://fatcat.wiki/release/fmi7hmpb3beotnj5kfyjjkolcy https://fatcat.wiki/release/isihxweh6ffxxhhrw2fthqymfa * [ ] https://fatcat.wiki/release/he334wpbobegxhptpkvvrufioq https://fatcat.wiki/release/td3ouhgtzbbe7ctevfnldqkoba * [ ] https://fatcat.wiki/release/5zybwzmlsjexri6c3ma6tczf7q https://fatcat.wiki/release/35gerfmlirelfh3af6qug2oz4q diff --git a/tests/data/release/gcqdvvjiq5bphl7lpc4invi4vy b/tests/data/release/gcqdvvjiq5bphl7lpc4invi4vy new file mode 100644 index 0000000..b5ef121 --- /dev/null +++ b/tests/data/release/gcqdvvjiq5bphl7lpc4invi4vy @@ -0,0 +1,23 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.3403/30276345u" + }, + "extra": { + "crossref": { + "type": "standard" + }, + "subtitle": [ + "Human contact with surfaces at moderate temperature" + ] + }, + "ident": "gcqdvvjiq5bphl7lpc4invi4vy", + "publisher": "BSI British Standards", + "refs": [], + "release_type": "standard", + "revision": "adb2642c-d4e2-4688-a926-ead47f938f1f", + "state": "active", + "title": "Ergonomics of the thermal environment. Methods for the assessment of human responses to contact with surfaces", + "work_id": "mr5lzipoavg7hfdg3gmsk7pdcq" +} diff --git a/tests/data/release/neznj5fb4nf3tdqnotnbe34b6e b/tests/data/release/neznj5fb4nf3tdqnotnbe34b6e new file mode 100644 index 0000000..f440351 --- /dev/null +++ b/tests/data/release/neznj5fb4nf3tdqnotnbe34b6e @@ -0,0 +1,23 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.3403/30276345" + }, + "extra": { + "crossref": { + "type": "standard" + }, + "subtitle": [ + "Human contact with surfaces at moderate temperature" + ] + }, + "ident": "neznj5fb4nf3tdqnotnbe34b6e", + "publisher": "BSI British Standards", + "refs": [], + "release_type": "standard", + "revision": "b964e8f3-490c-478d-8cb2-037c1d7ca315", + "state": "active", + "title": "Ergonomics of the thermal environment. Methods for the assessment of human responses to contact with surfaces", + "work_id": "3xz5b5yllnbh5mwiysfn3ur5e4" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 0c3dda2..741cf15 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -102,4 +102,5 @@ r63fa4mqkfb3leafypdwnnj7jq,usifdrmhdbfhlodgaqgzwzi4da,, bg4gzikycnfvtkfwl5qnxeywwa,fbdg4rdgw5halgkhr4qcsex25y,Status.EXACT, omjj75lv4rayvcqmgmicnzf5ye,xxfujnvafrazbjw7kvh7bhmuvy,, 63g4ukdxajcqhdytqla6du3t3u,rz72bzfevzeofdeb342c6z45qu,Status.DIFFERENT,Miss.CUSTOM_PREFIX_10_14288 -ij3yuoh6lrh3tkrv5o7gfk6yyi,tur236mqljdfdnlzbbnks2sily,Status.STRONG, +ij3yuoh6lrh3tkrv5o7gfk6yyi,tur236mqljdfdnlzbbnks2sily,Status.STRONG,OK.CUSTOM_IEEE_ARXIV +neznj5fb4nf3tdqnotnbe34b6e,gcqdvvjiq5bphl7lpc4invi4vy,Status.STRONG,OK.CUSTOM_BSI_UNDATED -- cgit v1.2.3