diff options
-rw-r--r-- | fuzzycat/common.py | 1 | ||||
-rw-r--r-- | fuzzycat/verify.py | 12 | ||||
-rw-r--r-- | notes/todo.md | 30 | ||||
-rw-r--r-- | tests/data/release/caxa7qbfqvg3bkgz4nwvapgnvi | 38 | ||||
-rw-r--r-- | tests/data/release/rnso2swxzvfonemgzrth3arumi | 37 | ||||
-rw-r--r-- | tests/data/verify.csv | 1 |
6 files changed, 115 insertions, 4 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py index 5cef684..8ebc43e 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -41,6 +41,7 @@ class Miss(str, Enum): BOOK_CHAPTER = 'miss.book_chapter' CHEM_FORMULA = 'miss.chem_formula' COMPONENT = 'miss.component' + CONTAINER = 'miss.container' CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty' CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla DATASET_DOI = 'miss.dataset_doi' diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 5977f8e..f44d9db 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -159,6 +159,17 @@ def compare(a, b): if fragment in a_title_lower: return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT) + + # https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi, + # https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi + if "subject index" in a_title_lower and "subject index" in b_title_lower: + try: + print(a, b) + if glom(a, "container_id") != glom(b, "container_id"): + return (Status.DIFFERENT, Miss.CONTAINER) + except PathAccessError: + pass + try: if a_title and a_title == b_title and glom(a, "extra.datacite.metadataVersion") != glom( b, "extra.datacite.metadataVersion"): @@ -296,7 +307,6 @@ def compare(a, b): # https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily def ieee_arxiv_pair_check(a, b): try: - print(a_slug_title, glom(a, "ext_ids.doi")) if (glom(a, "ext_ids.doi").split("/")[0] == "10.1109" and glom(b, "ext_ids.arxiv") != ""): return (Status.STRONG, OK.CUSTOM_IEEE_ARXIV) diff --git a/notes/todo.md b/notes/todo.md index e0479c3..49ce5d4 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -4,7 +4,7 @@ * 2805572 undecided items -Examples. +## Examples * [x] https://fatcat.wiki/release/73pcaauzwbalvi7aqhv6vopxl4 https://fatcat.wiki/release/xp3oxb7tqbgaxdzkzbchfkcjn4 @@ -49,6 +49,30 @@ STKE "fulltext" link does not lead anywhere; discontinued. 
> Interestingly, the same item, although different doi and URL, but image ID seems to be the same. -* [ ] https://fatcat.wiki/release/he334wpbobegxhptpkvvrufioq https://fatcat.wiki/release/td3ouhgtzbbe7ctevfnldqkoba +* [x] https://fatcat.wiki/release/he334wpbobegxhptpkvvrufioq https://fatcat.wiki/release/td3ouhgtzbbe7ctevfnldqkoba + +> datacite version + * [ ] https://fatcat.wiki/release/5zybwzmlsjexri6c3ma6tczf7q https://fatcat.wiki/release/35gerfmlirelfh3af6qug2oz4q -* [ ] https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi +* [x] https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi + +> too common title + +* [ ] https://fatcat.wiki/release/tfhflmc2gnfrncsv2pm2b4oraq https://fatcat.wiki/release/gp7cnryj5bczhao6oor5sbjaoe Status.AMBIGUOUS OK.DUMMY + +> Two items, datacite, but both version 1; one leads to an inaccessible item + +* [ ] https://fatcat.wiki/release/s4kjrs3g5ndlvixz2fgpydeuja https://fatcat.wiki/release/jn25jn44vzbc3nsubabl2wndsa Status.AMBIGUOUS OK.DUMMY +* [ ] https://fatcat.wiki/release/5xbugnniynea3k3pllzrb4lfeu https://fatcat.wiki/release/e52xw23ec5cxzi6mkyfyxifvhu Status.AMBIGUOUS OK.DUMMY + +* [ ] https://fatcat.wiki/release/6udxu4cnk5egrcxtfrrqt3jcli https://fatcat.wiki/release/ett4oyembjfahhe3iwoc44dnja Status.AMBIGUOUS OK.DUMMY + +> distinguish by page + +* [ ] https://fatcat.wiki/release/ehu6pdvzvvcmdoyq4l2yf4vciu https://fatcat.wiki/release/2omou6ehgjccbe6yjvr4wgnsha Status.AMBIGUOUS OK.DUMMY +* [ ] https://fatcat.wiki/release/zkqujozrx5cnjitmglclt6heqq https://fatcat.wiki/release/urr2gs4dsbbwdl7asgyqnwwtxy Status.AMBIGUOUS OK.DUMMY +* [ ] https://fatcat.wiki/release/yy2wzuaxhba7jht72mcjhxuaju https://fatcat.wiki/release/5b3lb2ebmrdp5nzxvohefmadre Status.AMBIGUOUS OK.DUMMY +* [ ] https://fatcat.wiki/release/iwtrxnov2repzlgoi2at2md6tm https://fatcat.wiki/release/s5hm65waingwjmgf3plu76hzu4 Status.AMBIGUOUS OK.DUMMY +* [ ] 
https://fatcat.wiki/release/b6wfpvotwrecdbygyn27kmihne https://fatcat.wiki/release/3vflegbxtrg4fknx4zyq3rf4im Status.AMBIGUOUS OK.DUMMY +* [ ] https://fatcat.wiki/release/zlywxoy7cfexvaatziqp4ip5m4 https://fatcat.wiki/release/phqelg6oc5hs5dehhgmodcnh5u Status.AMBIGUOUS OK.DUMMY + diff --git a/tests/data/release/caxa7qbfqvg3bkgz4nwvapgnvi b/tests/data/release/caxa7qbfqvg3bkgz4nwvapgnvi new file mode 100644 index 0000000..128fb9a --- /dev/null +++ b/tests/data/release/caxa7qbfqvg3bkgz4nwvapgnvi @@ -0,0 +1,38 @@ +{ + "abstracts": [], + "container_id": "qxxnnmcqozaubjb7lvgpdwft3q", + "contribs": [], + "ext_ids": { + "doi": "10.1159/000054005" + }, + "extra": { + "crossref": { + "archive": [ + "Portico" + ], + "license": [ + { + "URL": "https://www.karger.com/Services/SiteLicenses", + "content-version": "vor", + "delay-in-days": 0, + "start": "2000-01-01T00:00:00Z" + } + ], + "type": "journal-article" + } + }, + "ident": "caxa7qbfqvg3bkgz4nwvapgnvi", + "issue": "4-6", + "language": "en", + "pages": "343-344", + "publisher": "S. Karger AG", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2000, + "revision": "76c66222-e00e-438b-a76e-9bb330747f1d", + "state": "active", + "title": "Subject Index Vol. 
43, 2000", + "volume": "43", + "work_id": "nbx42io65bg4hbiiwesswgwizq" +} diff --git a/tests/data/release/rnso2swxzvfonemgzrth3arumi b/tests/data/release/rnso2swxzvfonemgzrth3arumi new file mode 100644 index 0000000..69cb214 --- /dev/null +++ b/tests/data/release/rnso2swxzvfonemgzrth3arumi @@ -0,0 +1,37 @@ +{ + "abstracts": [], + "container_id": "waqch6lx5raqbkrcmbicg62sdi", + "contribs": [], + "ext_ids": { + "doi": "10.1159/000008172" + }, + "extra": { + "crossref": { + "archive": [ + "Portico" + ], + "license": [ + { + "URL": "https://www.karger.com/Services/SiteLicenses", + "content-version": "vor", + "delay-in-days": 0, + "start": "2000-01-01T00:00:00Z" + } + ], + "type": "journal-article" + } + }, + "ident": "rnso2swxzvfonemgzrth3arumi", + "language": "en", + "pages": "250-250", + "publisher": "S. Karger AG", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2000, + "revision": "ee60371d-60c3-4114-9c05-6f32b0776bd3", + "state": "active", + "title": "Subject Index Vol. 43, 2000", + "volume": "43", + "work_id": "uxyz22zizfebrgyehhaefm6b4a" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 874baa0..b3054e0 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -105,3 +105,4 @@ omjj75lv4rayvcqmgmicnzf5ye,xxfujnvafrazbjw7kvh7bhmuvy,, ij3yuoh6lrh3tkrv5o7gfk6yyi,tur236mqljdfdnlzbbnks2sily,Status.STRONG,OK.CUSTOM_IEEE_ARXIV neznj5fb4nf3tdqnotnbe34b6e,gcqdvvjiq5bphl7lpc4invi4vy,Status.STRONG,OK.CUSTOM_BSI_UNDATED he334wpbobegxhptpkvvrufioq,td3ouhgtzbbe7ctevfnldqkoba,Status.EXACT,OK.DATACITE_VERSION +caxa7qbfqvg3bkgz4nwvapgnvi,rnso2swxzvfonemgzrth3arumi,Status.DIFFERENT,Miss.CONTAINER |