aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/common.py1
-rw-r--r--fuzzycat/verify.py10
-rw-r--r--notes/2020_11_testruns.md5
-rw-r--r--tests/data/release/s7a4o5v5gfg4tbzna6poyg7nzy23
-rw-r--r--tests/data/release/tcro5wr6brhqnf5wettyiauw3420
-rw-r--r--tests/data/verify.csv1
-rw-r--r--tests/test_verify.py4
7 files changed, 62 insertions, 2 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index c50962b..2257e9d 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -17,6 +17,7 @@ class OK(str, Enum):
Reason for assuming we have a match.
"""
ARXIV_VERSION = 'ok.arxiv_version'
+ CUSTOM_BSI_SUBDOC = 'ok.custom_bsi_subdoc'
CUSTOM_BSI_UNDATED = 'ok.custom_bsi_undated'
CUSTOM_IEEE_ARXIV = 'ok.custom_ieee_arxiv'
DATACITE_RELATED_ID = 'ok.datacite_related_id'
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 6fa4100..4daed97 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -76,7 +76,7 @@ from glom import PathAccessError, glom
from fuzzycat.common import OK, Miss, Status
from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, has_doi_prefix,
- num_project, slugify_string, jaccard)
+ jaccard, num_project, slugify_string)
# The result of clustering are documents that have a key k and a list of values
# (of the cluster) v.
@@ -211,6 +211,14 @@ def compare(a, b):
if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):
if a_doi + "u" == b_doi or b_doi + "u" == a_doi:
return (Status.STRONG, OK.CUSTOM_BSI_UNDATED)
+ # Subdocument: same title, but only one of the two releases has a subtitle.
+ # https://api.fatcat.wiki/v0/release/tcro5wr6brhqnf5wettyiauw34
+ # https://api.fatcat.wiki/v0/release/s7a4o5v5gfg4tbzna6poyg7nzy
+ if a_title == b_title and ((dict_key_exists(a, "extra.subtitle")
+ and not dict_key_exists(b, "extra.subtitle")) or
+ (dict_key_exists(b, "extra.subtitle")
+ and not dict_key_exists(a, "extra.subtitle"))):
+ return (Status.STRONG, OK.CUSTOM_BSI_SUBDOC)
except PathAccessError:
pass
diff --git a/notes/2020_11_testruns.md b/notes/2020_11_testruns.md
index 10fa426..e801df3 100644
--- a/notes/2020_11_testruns.md
+++ b/notes/2020_11_testruns.md
@@ -179,7 +179,12 @@ Ambiguous.
* [ ] https://fatcat.wiki/release/b3uhit7b4vhvliocdzwxr7peyy https://fatcat.wiki/release/zwru5ugcsfcyzeuqlygfw46vwq Status.AMBIGUOUS OK.DUMMY
+A difficult prefix.
+
* [ ] https://fatcat.wiki/release/s7a4o5v5gfg4tbzna6poyg7nzy https://fatcat.wiki/release/tcro5wr6brhqnf5wettyiauw34 Status.AMBIGUOUS OK.DUMMY
+
+BSI: one release is a subdocument of the other. The subdocument has a subtitle. This is more of an is-part-of relation.
+
* [ ] https://fatcat.wiki/release/b3odcrpuwveqljszl7l4c4ah5e https://fatcat.wiki/release/j4drpogsbzfd7esim4um5me374 Status.AMBIGUOUS OK.DUMMY
* [ ] https://fatcat.wiki/release/4owywaqwwvhrthytdjbulmltue https://fatcat.wiki/release/cy2v4wpm7nfptexi6ybjylapee Status.AMBIGUOUS OK.DUMMY
* [ ] https://fatcat.wiki/release/5ba6m23tmvcwdcpq3a2rcof4du https://fatcat.wiki/release/wczxyrrmovf3te5ziep7bbdreu Status.AMBIGUOUS OK.DUMMY
diff --git a/tests/data/release/s7a4o5v5gfg4tbzna6poyg7nzy b/tests/data/release/s7a4o5v5gfg4tbzna6poyg7nzy
new file mode 100644
index 0000000..02169b3
--- /dev/null
+++ b/tests/data/release/s7a4o5v5gfg4tbzna6poyg7nzy
@@ -0,0 +1,23 @@
+{
+ "abstracts": [],
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.3403/02279101u"
+ },
+ "extra": {
+ "crossref": {
+ "type": "standard"
+ },
+ "subtitle": [
+ "Fixed pipette method"
+ ]
+ },
+ "ident": "s7a4o5v5gfg4tbzna6poyg7nzy",
+ "publisher": "BSI British Standards",
+ "refs": [],
+ "release_type": "standard",
+ "revision": "9ae088bf-5480-4a5b-b68d-356096d28d16",
+ "state": "active",
+ "title": "Determination of particle size distribution by gravitational liquid sedimentation methods",
+ "work_id": "chjktubyxbdpviowq6p4agfh5y"
+}
diff --git a/tests/data/release/tcro5wr6brhqnf5wettyiauw34 b/tests/data/release/tcro5wr6brhqnf5wettyiauw34
new file mode 100644
index 0000000..2247102
--- /dev/null
+++ b/tests/data/release/tcro5wr6brhqnf5wettyiauw34
@@ -0,0 +1,20 @@
+{
+ "abstracts": [],
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.3403/bsiso13317"
+ },
+ "extra": {
+ "crossref": {
+ "type": "standard"
+ }
+ },
+ "ident": "tcro5wr6brhqnf5wettyiauw34",
+ "publisher": "BSI British Standards",
+ "refs": [],
+ "release_type": "standard",
+ "revision": "23fae426-a22e-4226-8400-f085d7eb0c56",
+ "state": "active",
+ "title": "Determination of particle size distribution by gravitational liquid sedimentation methods",
+ "work_id": "hh6o7mm3ivblng4t2dhuhzocwm"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 389762c..d3c8dbe 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -137,3 +137,4 @@ voruupqxhvggfex4zlczcmjxxu,jg72qhdvmncfdfxg5l47hw3uba,Status.AMBIGUOUS,OK.DUMMY
6ysfa7ncx5fldmvmwvjgpf2i6e,yp3rs3xb5ra2riyx5xayrlqfum,Status.EXACT,OK.WORK_ID
arqtphat7fashokettncepu7xe,v6p7xct6kfgwtdbh57zfjqmuua,Status.AMBIGUOUS,
zwru5ugcsfcyzeuqlygfw46vwq,b3uhit7b4vhvliocdzwxr7peyy,Status.AMBIGUOUS,Miss.CUSTOM_PREFIX_10_7916
+s7a4o5v5gfg4tbzna6poyg7nzy,tcro5wr6brhqnf5wettyiauw34,Status.STRONG,OK.CUSTOM_BSI_SUBDOC
diff --git a/tests/test_verify.py b/tests/test_verify.py
index 5a60326..e3e05ad 100644
--- a/tests/test_verify.py
+++ b/tests/test_verify.py
@@ -38,7 +38,9 @@ def test_compare():
try:
a, b, expected_status, expected_reason = row
except ValueError as exc:
- pytest.fail("invalid test file, maybe too many (or few) commas in row {}? {}".format(i + 1, exc))
+ pytest.fail(
+ "invalid test file, maybe too many (or few) commas in row {}? {}".format(
+ i + 1, exc))
status, reason = compare(load_release_ident(a), load_release_ident(b))
if not expected_status or expected_status.lower() == "todo":
logger.warning(