about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fuzzycat/common.py 1
-rw-r--r-- fuzzycat/utils.py 9
-rw-r--r-- fuzzycat/verify.py 22
-rw-r--r-- notes/todo.md 6
-rw-r--r-- tests/data/release/gcqdvvjiq5bphl7lpc4invi4vy 23
-rw-r--r-- tests/data/release/neznj5fb4nf3tdqnotnbe34b6e 23
-rw-r--r-- tests/data/verify.csv 3
7 files changed, 76 insertions, 11 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index 2298185..34508b7 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -26,6 +26,7 @@ class OK(str, Enum):
TITLE_AUTHOR_MATCH = 'ok.title_author_match'
TOKENIZED_AUTHORS = 'ok.tokenized_authors'
CUSTOM_IEEE_ARXIV = 'ok.custom_ieee_arxiv'
+ CUSTOM_BSI_UNDATED = 'ok.custom_bsi_undated'
class Miss(str, Enum):
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index d6beb03..ef3b418 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -9,6 +9,15 @@ printable_no_punct = string.digits + string.ascii_letters + string.whitespace
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
+def has_doi_prefix(v, prefix="10.1234"):
+ """
+ Returns True if the part of v before the first '/' equals prefix; returns False if v is empty (or None) or the prefix does not match.
+ """
+ if not v:
+ return False
+ return v.split("/")[0] == prefix
+
+
def slugify_string(s: str) -> str:
"""
Keeps ascii chars and single whitespace only.
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 84e17d8..e688d49 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -75,8 +75,8 @@ import sys
from glom import PathAccessError, glom
from fuzzycat.common import OK, Miss, Status
-from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, num_project,
- slugify_string)
+from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, has_doi_prefix,
+ num_project, slugify_string)
# The result of clustering are documents that have a key k and a list of values
# (of the cluster) v.
@@ -171,6 +171,15 @@ def compare(a, b):
except PathAccessError:
pass
+ try:
+ a_doi = glom(a, "ext_ids.doi")
+ b_doi = glom(b, "ext_ids.doi")
+ if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):
+ if a_doi + "u" == b_doi or b_doi + "u" == a_doi:
+ return (Status.STRONG, OK.CUSTOM_BSI_UNDATED)
+ except PathAccessError:
+ pass
+
if "Zweckverband Volkshochschule " in a_title and a_title != b_title:
return (Status.DIFFERENT, Miss.CUSTOM_VHS)
@@ -275,13 +284,6 @@ def compare(a, b):
a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
- try:
- if glom(a, "ext_ids.doi") == "10.1109/nssmic.2013.6829591":
- print(a_slug_title)
- print(b_slug_title)
- except PathAccessError:
- pass
-
if a_slug_title == b_slug_title:
# via: https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi
# https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily
@@ -294,6 +296,8 @@ def compare(a, b):
except PathAccessError:
pass
+ # TODO: we might want a lightweight Python DSL to express these
+ # (commutative) checks
result = ieee_arxiv_pair_check(a, b)
if result:
return result
diff --git a/notes/todo.md b/notes/todo.md
index aeaf6f0..107544d 100644
--- a/notes/todo.md
+++ b/notes/todo.md
@@ -36,11 +36,15 @@ STKE "fulltext" link does not lead anywhere; discontinued.
> book vs article-journal
-* [ ] https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily
+* [x] https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily
> preprint and IEEE published article
* [ ] https://fatcat.wiki/release/neznj5fb4nf3tdqnotnbe34b6e https://fatcat.wiki/release/gcqdvvjiq5bphl7lpc4invi4vy
+
+> a standard document; DOI and DOIu -- the "u" suffix means "undated" (as per URL):
+> https://landingpage.bsigroup.com/LandingPage/Undated?UPI=000000000030281171
+
* [ ] https://fatcat.wiki/release/fmi7hmpb3beotnj5kfyjjkolcy https://fatcat.wiki/release/isihxweh6ffxxhhrw2fthqymfa
* [ ] https://fatcat.wiki/release/he334wpbobegxhptpkvvrufioq https://fatcat.wiki/release/td3ouhgtzbbe7ctevfnldqkoba
* [ ] https://fatcat.wiki/release/5zybwzmlsjexri6c3ma6tczf7q https://fatcat.wiki/release/35gerfmlirelfh3af6qug2oz4q
diff --git a/tests/data/release/gcqdvvjiq5bphl7lpc4invi4vy b/tests/data/release/gcqdvvjiq5bphl7lpc4invi4vy
new file mode 100644
index 0000000..b5ef121
--- /dev/null
+++ b/tests/data/release/gcqdvvjiq5bphl7lpc4invi4vy
@@ -0,0 +1,23 @@
+{
+ "abstracts": [],
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.3403/30276345u"
+ },
+ "extra": {
+ "crossref": {
+ "type": "standard"
+ },
+ "subtitle": [
+ "Human contact with surfaces at moderate temperature"
+ ]
+ },
+ "ident": "gcqdvvjiq5bphl7lpc4invi4vy",
+ "publisher": "BSI British Standards",
+ "refs": [],
+ "release_type": "standard",
+ "revision": "adb2642c-d4e2-4688-a926-ead47f938f1f",
+ "state": "active",
+ "title": "Ergonomics of the thermal environment. Methods for the assessment of human responses to contact with surfaces",
+ "work_id": "mr5lzipoavg7hfdg3gmsk7pdcq"
+}
diff --git a/tests/data/release/neznj5fb4nf3tdqnotnbe34b6e b/tests/data/release/neznj5fb4nf3tdqnotnbe34b6e
new file mode 100644
index 0000000..f440351
--- /dev/null
+++ b/tests/data/release/neznj5fb4nf3tdqnotnbe34b6e
@@ -0,0 +1,23 @@
+{
+ "abstracts": [],
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.3403/30276345"
+ },
+ "extra": {
+ "crossref": {
+ "type": "standard"
+ },
+ "subtitle": [
+ "Human contact with surfaces at moderate temperature"
+ ]
+ },
+ "ident": "neznj5fb4nf3tdqnotnbe34b6e",
+ "publisher": "BSI British Standards",
+ "refs": [],
+ "release_type": "standard",
+ "revision": "b964e8f3-490c-478d-8cb2-037c1d7ca315",
+ "state": "active",
+ "title": "Ergonomics of the thermal environment. Methods for the assessment of human responses to contact with surfaces",
+ "work_id": "3xz5b5yllnbh5mwiysfn3ur5e4"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 0c3dda2..741cf15 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -102,4 +102,5 @@ r63fa4mqkfb3leafypdwnnj7jq,usifdrmhdbfhlodgaqgzwzi4da,,
bg4gzikycnfvtkfwl5qnxeywwa,fbdg4rdgw5halgkhr4qcsex25y,Status.EXACT,
omjj75lv4rayvcqmgmicnzf5ye,xxfujnvafrazbjw7kvh7bhmuvy,,
63g4ukdxajcqhdytqla6du3t3u,rz72bzfevzeofdeb342c6z45qu,Status.DIFFERENT,Miss.CUSTOM_PREFIX_10_14288
-ij3yuoh6lrh3tkrv5o7gfk6yyi,tur236mqljdfdnlzbbnks2sily,Status.STRONG,
+ij3yuoh6lrh3tkrv5o7gfk6yyi,tur236mqljdfdnlzbbnks2sily,Status.STRONG,OK.CUSTOM_IEEE_ARXIV
+neznj5fb4nf3tdqnotnbe34b6e,gcqdvvjiq5bphl7lpc4invi4vy,Status.STRONG,OK.CUSTOM_BSI_UNDATED