about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fuzzycat/common.py 3
-rw-r--r-- fuzzycat/verify.py 14
-rw-r--r-- notes/todo.md 11
-rw-r--r-- tests/data/release/2omou6ehgjccbe6yjvr4wgnsha 28
-rw-r--r-- tests/data/release/5b3lb2ebmrdp5nzxvohefmadre 31
-rw-r--r-- tests/data/release/ehu6pdvzvvcmdoyq4l2yf4vciu 28
-rw-r--r-- tests/data/release/urr2gs4dsbbwdl7asgyqnwwtxy 27
-rw-r--r-- tests/data/release/yy2wzuaxhba7jht72mcjhxuaju 23
-rw-r--r-- tests/data/release/zkqujozrx5cnjitmglclt6heqq 27
-rw-r--r-- tests/data/verify.csv 3
10 files changed, 191 insertions, 4 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index 8ebc43e..60f42ab 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -43,6 +43,8 @@ class Miss(str, Enum):
COMPONENT = 'miss.component'
CONTAINER = 'miss.container'
CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
+ CUSTOM_IOP_MA_PATTERN = 'miss.custom_iop_ma_pattern'
+ CUSTOM_PREFIX_10_14288 = 'miss.custom_prefix_10_14288'
CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
DATASET_DOI = 'miss.dataset_doi'
NUM_DIFF = 'miss.num_diff'
@@ -51,4 +53,3 @@ class Miss(str, Enum):
SUBTITLE = 'miss.subtitle'
TITLE_FILENAME = 'miss.title_filename'
YEAR = 'miss.year'
- CUSTOM_PREFIX_10_14288 = 'miss.custom_prefix_10_14288'
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index f44d9db..993b7c9 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -159,12 +159,10 @@ def compare(a, b):
if fragment in a_title_lower:
return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT)
-
# https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi,
# https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi
if "subject index" in a_title_lower and "subject index" in b_title_lower:
try:
- print(a, b)
if glom(a, "container_id") != glom(b, "container_id"):
return (Status.DIFFERENT, Miss.CONTAINER)
except PathAccessError:
@@ -198,6 +196,16 @@ def compare(a, b):
except PathAccessError:
pass
+ try:
+ a_doi = glom(a, "ext_ids.doi")
+ b_doi = glom(b, "ext_ids.doi")
+ if has_doi_prefix(a_doi, "10.1149") and has_doi_prefix(b_doi, "10.1149"):
+ if (a_doi.startswith("10.1149/ma") and not b_doi.startswith("10.1149/ma")
+ or b_doi.startswith("10.1149/ma") and not a_doi.startswith("10.1149/ma")):
+ return (Status.DIFFERENT, Miss.CUSTOM_IOP_MA_PATTERN)
+ except PathAccessError:
+ pass
+
if "Zweckverband Volkshochschule " in a_title and a_title != b_title:
return (Status.DIFFERENT, Miss.CUSTOM_VHS)
@@ -426,6 +434,8 @@ TITLE_FRAGMENT_BLACKLIST = set([
"nouvelles du corps médical",
"student government minutes:",
"untersuchung einzelner abdominaler regionen und organe",
+ "annual general meeting",
+ "records of meetings",
])
CONTAINER_NAME_BLACKLIST = set([
diff --git a/notes/todo.md b/notes/todo.md
index 49ce5d4..4231363 100644
--- a/notes/todo.md
+++ b/notes/todo.md
@@ -67,11 +67,20 @@ STKE "fulltext" link does not lead anywhere; discontinued.
* [ ] https://fatcat.wiki/release/6udxu4cnk5egrcxtfrrqt3jcli https://fatcat.wiki/release/ett4oyembjfahhe3iwoc44dnja Status.AMBIGUOUS OK.DUMMY
-> distinguish by page
+> todo: distinguish by page
* [ ] https://fatcat.wiki/release/ehu6pdvzvvcmdoyq4l2yf4vciu https://fatcat.wiki/release/2omou6ehgjccbe6yjvr4wgnsha Status.AMBIGUOUS OK.DUMMY
+
+Blacklist fragment.
+
* [ ] https://fatcat.wiki/release/zkqujozrx5cnjitmglclt6heqq https://fatcat.wiki/release/urr2gs4dsbbwdl7asgyqnwwtxy Status.AMBIGUOUS OK.DUMMY
+
+Blacklist fragment.
+
* [ ] https://fatcat.wiki/release/yy2wzuaxhba7jht72mcjhxuaju https://fatcat.wiki/release/5b3lb2ebmrdp5nzxvohefmadre Status.AMBIGUOUS OK.DUMMY
+
+> Meeting abstract (ma) versus document.
+
* [ ] https://fatcat.wiki/release/iwtrxnov2repzlgoi2at2md6tm https://fatcat.wiki/release/s5hm65waingwjmgf3plu76hzu4 Status.AMBIGUOUS OK.DUMMY
* [ ] https://fatcat.wiki/release/b6wfpvotwrecdbygyn27kmihne https://fatcat.wiki/release/3vflegbxtrg4fknx4zyq3rf4im Status.AMBIGUOUS OK.DUMMY
* [ ] https://fatcat.wiki/release/zlywxoy7cfexvaatziqp4ip5m4 https://fatcat.wiki/release/phqelg6oc5hs5dehhgmodcnh5u Status.AMBIGUOUS OK.DUMMY
diff --git a/tests/data/release/2omou6ehgjccbe6yjvr4wgnsha b/tests/data/release/2omou6ehgjccbe6yjvr4wgnsha
new file mode 100644
index 0000000..e72472f
--- /dev/null
+++ b/tests/data/release/2omou6ehgjccbe6yjvr4wgnsha
@@ -0,0 +1,28 @@
+{
+ "abstracts": [],
+ "container_id": "2y4b4m3v6rdepehsbnvon2yi5a",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1080/00379816009513763"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "10.1080/00379816009513763"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "2omou6ehgjccbe6yjvr4wgnsha",
+ "language": "en",
+ "pages": "329-331",
+ "publisher": "Informa UK Limited",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1963,
+ "revision": "cec3ccfc-354f-4344-9643-c49e9407ff60",
+ "state": "active",
+ "title": "The society's chronicle: Annual general meeting",
+ "work_id": "2i3c2hrtszeilhpjpepi5eis3u"
+}
diff --git a/tests/data/release/5b3lb2ebmrdp5nzxvohefmadre b/tests/data/release/5b3lb2ebmrdp5nzxvohefmadre
new file mode 100644
index 0000000..6fd2ad0
--- /dev/null
+++ b/tests/data/release/5b3lb2ebmrdp5nzxvohefmadre
@@ -0,0 +1,31 @@
+{
+ "abstracts": [],
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "Steven J. Suess",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1149/1.2408933"
+ },
+ "extra": {
+ "container_name": "ECS Transactions",
+ "crossref": {
+ "type": "proceedings-article"
+ }
+ },
+ "ident": "5b3lb2ebmrdp5nzxvohefmadre",
+ "publisher": "ECS",
+ "refs": [],
+ "release_type": "paper-conference",
+ "release_year": 2007,
+ "revision": "b852e6ef-2b3c-4a78-aad8-168c086ae1ab",
+ "state": "active",
+ "title": "Analysis of Discolored and Corroded Components",
+ "work_id": "ham5odne4jefbfcx4zsboiuzvu"
+}
diff --git a/tests/data/release/ehu6pdvzvvcmdoyq4l2yf4vciu b/tests/data/release/ehu6pdvzvvcmdoyq4l2yf4vciu
new file mode 100644
index 0000000..ee9d455
--- /dev/null
+++ b/tests/data/release/ehu6pdvzvvcmdoyq4l2yf4vciu
@@ -0,0 +1,28 @@
+{
+ "abstracts": [],
+ "container_id": "2y4b4m3v6rdepehsbnvon2yi5a",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1080/00379816009513736"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "10.1080/00379816009513736"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "ehu6pdvzvvcmdoyq4l2yf4vciu",
+ "language": "en",
+ "pages": "227-229",
+ "publisher": "Informa UK Limited",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1962,
+ "revision": "0ffa280d-dec2-45e9-bea4-10114a1f46d2",
+ "state": "active",
+ "title": "The Society's chronicle: Annual general meeting",
+ "work_id": "k572bwhqivaspk3wydfn322vuy"
+}
diff --git a/tests/data/release/urr2gs4dsbbwdl7asgyqnwwtxy b/tests/data/release/urr2gs4dsbbwdl7asgyqnwwtxy
new file mode 100644
index 0000000..c8eb3ea
--- /dev/null
+++ b/tests/data/release/urr2gs4dsbbwdl7asgyqnwwtxy
@@ -0,0 +1,27 @@
+{
+ "abstracts": [],
+ "container_id": "jq2tcuaoe5h6lkbunyw6ijhovi",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1144/pygs.14.3.513"
+ },
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ },
+ "ident": "urr2gs4dsbbwdl7asgyqnwwtxy",
+ "language": "en",
+ "pages": "513-524",
+ "publisher": "Geological Society of London",
+ "refs": [],
+ "release_date": "1902-01-01",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1902,
+ "revision": "3d406b3d-2065-4ffe-b953-0d06bfd377c8",
+ "state": "active",
+ "title": "Records of Meetings",
+ "volume": "14",
+ "work_id": "kxkil2n2vrdd5b7hcm565ekryi"
+}
diff --git a/tests/data/release/yy2wzuaxhba7jht72mcjhxuaju b/tests/data/release/yy2wzuaxhba7jht72mcjhxuaju
new file mode 100644
index 0000000..ded3bd3
--- /dev/null
+++ b/tests/data/release/yy2wzuaxhba7jht72mcjhxuaju
@@ -0,0 +1,23 @@
+{
+ "abstracts": [],
+ "container_id": "rb7ftxiprfchdbbjnx5kpywhay",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1149/ma2006-01/6/327"
+ },
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ },
+ "ident": "yy2wzuaxhba7jht72mcjhxuaju",
+ "publisher": "The Electrochemical Society",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2006,
+ "revision": "fd3fd3dc-b506-41d1-bbae-41aef34335d3",
+ "state": "active",
+ "title": "Analysis of Discolored and Corroded Components",
+ "work_id": "vfe7uctv7zelvohb3u7xuwsgpy"
+}
diff --git a/tests/data/release/zkqujozrx5cnjitmglclt6heqq b/tests/data/release/zkqujozrx5cnjitmglclt6heqq
new file mode 100644
index 0000000..0083bc3
--- /dev/null
+++ b/tests/data/release/zkqujozrx5cnjitmglclt6heqq
@@ -0,0 +1,27 @@
+{
+ "abstracts": [],
+ "container_id": "jq2tcuaoe5h6lkbunyw6ijhovi",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1144/pygs.15.3.483"
+ },
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ },
+ "ident": "zkqujozrx5cnjitmglclt6heqq",
+ "language": "en",
+ "pages": "483-493",
+ "publisher": "Geological Society of London",
+ "refs": [],
+ "release_date": "1905-01-01",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1905,
+ "revision": "4e15035c-a8b5-43f3-9500-925bd6a9e042",
+ "state": "active",
+ "title": "RECORDS OF MEETINGS",
+ "volume": "15",
+ "work_id": "ydqm2e7jsndexkhpavlyqabram"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index b3054e0..9b449ea 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -106,3 +106,6 @@ ij3yuoh6lrh3tkrv5o7gfk6yyi,tur236mqljdfdnlzbbnks2sily,Status.STRONG,OK.CUSTOM_IE
neznj5fb4nf3tdqnotnbe34b6e,gcqdvvjiq5bphl7lpc4invi4vy,Status.STRONG,OK.CUSTOM_BSI_UNDATED
he334wpbobegxhptpkvvrufioq,td3ouhgtzbbe7ctevfnldqkoba,Status.EXACT,OK.DATACITE_VERSION
caxa7qbfqvg3bkgz4nwvapgnvi,rnso2swxzvfonemgzrth3arumi,Status.DIFFERENT,Miss.CONTAINER
+ehu6pdvzvvcmdoyq4l2yf4vciu,2omou6ehgjccbe6yjvr4wgnsha,Status.AMBIGUOUS,Miss.BLACKLISTED_FRAGMENT
+zkqujozrx5cnjitmglclt6heqq,urr2gs4dsbbwdl7asgyqnwwtxy,Status.AMBIGUOUS,Miss.BLACKLISTED_FRAGMENT
+5b3lb2ebmrdp5nzxvohefmadre,yy2wzuaxhba7jht72mcjhxuaju,Status.DIFFERENT,Miss.CUSTOM_IOP_MA_PATTERN