aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/common.py1
-rw-r--r--fuzzycat/verify.py12
-rw-r--r--notes/todo.md30
-rw-r--r--tests/data/release/caxa7qbfqvg3bkgz4nwvapgnvi38
-rw-r--r--tests/data/release/rnso2swxzvfonemgzrth3arumi37
-rw-r--r--tests/data/verify.csv1
6 files changed, 115 insertions, 4 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index 5cef684..8ebc43e 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -41,6 +41,7 @@ class Miss(str, Enum):
BOOK_CHAPTER = 'miss.book_chapter'
CHEM_FORMULA = 'miss.chem_formula'
COMPONENT = 'miss.component'
+ CONTAINER = 'miss.container'
CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
DATASET_DOI = 'miss.dataset_doi'
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 5977f8e..f44d9db 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -159,6 +159,17 @@ def compare(a, b):
if fragment in a_title_lower:
return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT)
+
+ # https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi,
+ # https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi
+ if "subject index" in a_title_lower and "subject index" in b_title_lower:
+ try:
+ print(a, b)
+ if glom(a, "container_id") != glom(b, "container_id"):
+ return (Status.DIFFERENT, Miss.CONTAINER)
+ except PathAccessError:
+ pass
+
try:
if a_title and a_title == b_title and glom(a, "extra.datacite.metadataVersion") != glom(
b, "extra.datacite.metadataVersion"):
@@ -296,7 +307,6 @@ def compare(a, b):
# https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily
def ieee_arxiv_pair_check(a, b):
try:
- print(a_slug_title, glom(a, "ext_ids.doi"))
if (glom(a, "ext_ids.doi").split("/")[0] == "10.1109"
and glom(b, "ext_ids.arxiv") != ""):
return (Status.STRONG, OK.CUSTOM_IEEE_ARXIV)
diff --git a/notes/todo.md b/notes/todo.md
index e0479c3..49ce5d4 100644
--- a/notes/todo.md
+++ b/notes/todo.md
@@ -4,7 +4,7 @@
* 2805572 undecided items
-Examples.
+## Examples
* [x] https://fatcat.wiki/release/73pcaauzwbalvi7aqhv6vopxl4 https://fatcat.wiki/release/xp3oxb7tqbgaxdzkzbchfkcjn4
@@ -49,6 +49,30 @@ STKE "fulltext" link does not lead anywhere; discontinued.
> Interestingly, this is the same item: although the DOI and URL differ, the image ID seems to be the same.
-* [ ] https://fatcat.wiki/release/he334wpbobegxhptpkvvrufioq https://fatcat.wiki/release/td3ouhgtzbbe7ctevfnldqkoba
+* [x] https://fatcat.wiki/release/he334wpbobegxhptpkvvrufioq https://fatcat.wiki/release/td3ouhgtzbbe7ctevfnldqkoba
+
+> datacite version
+
* [ ] https://fatcat.wiki/release/5zybwzmlsjexri6c3ma6tczf7q https://fatcat.wiki/release/35gerfmlirelfh3af6qug2oz4q
-* [ ] https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi
+* [x] https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi
+
+> too common title
+
+* [ ] https://fatcat.wiki/release/tfhflmc2gnfrncsv2pm2b4oraq https://fatcat.wiki/release/gp7cnryj5bczhao6oor5sbjaoe Status.AMBIGUOUS OK.DUMMY
+
+> Two items, both DataCite and both version 1; one leads to an inaccessible item
+
+* [ ] https://fatcat.wiki/release/s4kjrs3g5ndlvixz2fgpydeuja https://fatcat.wiki/release/jn25jn44vzbc3nsubabl2wndsa Status.AMBIGUOUS OK.DUMMY
+* [ ] https://fatcat.wiki/release/5xbugnniynea3k3pllzrb4lfeu https://fatcat.wiki/release/e52xw23ec5cxzi6mkyfyxifvhu Status.AMBIGUOUS OK.DUMMY
+
+* [ ] https://fatcat.wiki/release/6udxu4cnk5egrcxtfrrqt3jcli https://fatcat.wiki/release/ett4oyembjfahhe3iwoc44dnja Status.AMBIGUOUS OK.DUMMY
+
+> distinguish by page
+
+* [ ] https://fatcat.wiki/release/ehu6pdvzvvcmdoyq4l2yf4vciu https://fatcat.wiki/release/2omou6ehgjccbe6yjvr4wgnsha Status.AMBIGUOUS OK.DUMMY
+* [ ] https://fatcat.wiki/release/zkqujozrx5cnjitmglclt6heqq https://fatcat.wiki/release/urr2gs4dsbbwdl7asgyqnwwtxy Status.AMBIGUOUS OK.DUMMY
+* [ ] https://fatcat.wiki/release/yy2wzuaxhba7jht72mcjhxuaju https://fatcat.wiki/release/5b3lb2ebmrdp5nzxvohefmadre Status.AMBIGUOUS OK.DUMMY
+* [ ] https://fatcat.wiki/release/iwtrxnov2repzlgoi2at2md6tm https://fatcat.wiki/release/s5hm65waingwjmgf3plu76hzu4 Status.AMBIGUOUS OK.DUMMY
+* [ ] https://fatcat.wiki/release/b6wfpvotwrecdbygyn27kmihne https://fatcat.wiki/release/3vflegbxtrg4fknx4zyq3rf4im Status.AMBIGUOUS OK.DUMMY
+* [ ] https://fatcat.wiki/release/zlywxoy7cfexvaatziqp4ip5m4 https://fatcat.wiki/release/phqelg6oc5hs5dehhgmodcnh5u Status.AMBIGUOUS OK.DUMMY
+
diff --git a/tests/data/release/caxa7qbfqvg3bkgz4nwvapgnvi b/tests/data/release/caxa7qbfqvg3bkgz4nwvapgnvi
new file mode 100644
index 0000000..128fb9a
--- /dev/null
+++ b/tests/data/release/caxa7qbfqvg3bkgz4nwvapgnvi
@@ -0,0 +1,38 @@
+{
+ "abstracts": [],
+ "container_id": "qxxnnmcqozaubjb7lvgpdwft3q",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1159/000054005"
+ },
+ "extra": {
+ "crossref": {
+ "archive": [
+ "Portico"
+ ],
+ "license": [
+ {
+ "URL": "https://www.karger.com/Services/SiteLicenses",
+ "content-version": "vor",
+ "delay-in-days": 0,
+ "start": "2000-01-01T00:00:00Z"
+ }
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "caxa7qbfqvg3bkgz4nwvapgnvi",
+ "issue": "4-6",
+ "language": "en",
+ "pages": "343-344",
+ "publisher": "S. Karger AG",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2000,
+ "revision": "76c66222-e00e-438b-a76e-9bb330747f1d",
+ "state": "active",
+ "title": "Subject Index Vol. 43, 2000",
+ "volume": "43",
+ "work_id": "nbx42io65bg4hbiiwesswgwizq"
+}
diff --git a/tests/data/release/rnso2swxzvfonemgzrth3arumi b/tests/data/release/rnso2swxzvfonemgzrth3arumi
new file mode 100644
index 0000000..69cb214
--- /dev/null
+++ b/tests/data/release/rnso2swxzvfonemgzrth3arumi
@@ -0,0 +1,37 @@
+{
+ "abstracts": [],
+ "container_id": "waqch6lx5raqbkrcmbicg62sdi",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1159/000008172"
+ },
+ "extra": {
+ "crossref": {
+ "archive": [
+ "Portico"
+ ],
+ "license": [
+ {
+ "URL": "https://www.karger.com/Services/SiteLicenses",
+ "content-version": "vor",
+ "delay-in-days": 0,
+ "start": "2000-01-01T00:00:00Z"
+ }
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "rnso2swxzvfonemgzrth3arumi",
+ "language": "en",
+ "pages": "250-250",
+ "publisher": "S. Karger AG",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2000,
+ "revision": "ee60371d-60c3-4114-9c05-6f32b0776bd3",
+ "state": "active",
+ "title": "Subject Index Vol. 43, 2000",
+ "volume": "43",
+ "work_id": "uxyz22zizfebrgyehhaefm6b4a"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 874baa0..b3054e0 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -105,3 +105,4 @@ omjj75lv4rayvcqmgmicnzf5ye,xxfujnvafrazbjw7kvh7bhmuvy,,
ij3yuoh6lrh3tkrv5o7gfk6yyi,tur236mqljdfdnlzbbnks2sily,Status.STRONG,OK.CUSTOM_IEEE_ARXIV
neznj5fb4nf3tdqnotnbe34b6e,gcqdvvjiq5bphl7lpc4invi4vy,Status.STRONG,OK.CUSTOM_BSI_UNDATED
he334wpbobegxhptpkvvrufioq,td3ouhgtzbbe7ctevfnldqkoba,Status.EXACT,OK.DATACITE_VERSION
+caxa7qbfqvg3bkgz4nwvapgnvi,rnso2swxzvfonemgzrth3arumi,Status.DIFFERENT,Miss.CONTAINER