diff options
-rw-r--r-- | fuzzycat/common.py | 3 | ||||
-rw-r--r-- | fuzzycat/verify.py | 14 | ||||
-rw-r--r-- | notes/todo.md | 11 | ||||
-rw-r--r-- | tests/data/release/2omou6ehgjccbe6yjvr4wgnsha | 28 | ||||
-rw-r--r-- | tests/data/release/5b3lb2ebmrdp5nzxvohefmadre | 31 | ||||
-rw-r--r-- | tests/data/release/ehu6pdvzvvcmdoyq4l2yf4vciu | 28 | ||||
-rw-r--r-- | tests/data/release/urr2gs4dsbbwdl7asgyqnwwtxy | 27 | ||||
-rw-r--r-- | tests/data/release/yy2wzuaxhba7jht72mcjhxuaju | 23 | ||||
-rw-r--r-- | tests/data/release/zkqujozrx5cnjitmglclt6heqq | 27 | ||||
-rw-r--r-- | tests/data/verify.csv | 3 |
10 files changed, 191 insertions, 4 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py index 8ebc43e..60f42ab 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -43,6 +43,8 @@ class Miss(str, Enum): COMPONENT = 'miss.component' CONTAINER = 'miss.container' CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty' + CUSTOM_IOP_MA_PATTERN = 'miss.custom_iop_ma_pattern' + CUSTOM_PREFIX_10_14288 = 'miss.custom_prefix_10_14288' CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla DATASET_DOI = 'miss.dataset_doi' NUM_DIFF = 'miss.num_diff' @@ -51,4 +53,3 @@ class Miss(str, Enum): SUBTITLE = 'miss.subtitle' TITLE_FILENAME = 'miss.title_filename' YEAR = 'miss.year' - CUSTOM_PREFIX_10_14288 = 'miss.custom_prefix_10_14288' diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index f44d9db..993b7c9 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -159,12 +159,10 @@ def compare(a, b): if fragment in a_title_lower: return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT) - # https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi, # https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi if "subject index" in a_title_lower and "subject index" in b_title_lower: try: - print(a, b) if glom(a, "container_id") != glom(b, "container_id"): return (Status.DIFFERENT, Miss.CONTAINER) except PathAccessError: @@ -198,6 +196,16 @@ def compare(a, b): except PathAccessError: pass + try: + a_doi = glom(a, "ext_ids.doi") + b_doi = glom(b, "ext_ids.doi") + if has_doi_prefix(a_doi, "10.1149") and has_doi_prefix(b_doi, "10.1149"): + if (a_doi.startswith("10.1149/ma") and not b_doi.startswith("10.1149/ma") + or b_doi.startswith("10.1149/ma") and not a_doi.startswith("10.1149/ma")): + return (Status.DIFFERENT, Miss.CUSTOM_IOP_MA_PATTERN) + except PathAccessError: + pass + if "Zweckverband Volkshochschule " in a_title and a_title != b_title: return (Status.DIFFERENT, Miss.CUSTOM_VHS) @@ -426,6 +434,8 @@ TITLE_FRAGMENT_BLACKLIST = set([ "nouvelles du corps médical", "student government minutes:", "untersuchung einzelner abdominaler regionen und organe", + "annual general meeting", + "records of meetings", ]) CONTAINER_NAME_BLACKLIST = set([ diff --git a/notes/todo.md b/notes/todo.md index 49ce5d4..4231363 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -67,11 +67,20 @@ STKE "fulltext" link does not lead anywhere; discontinued. * [ ] https://fatcat.wiki/release/6udxu4cnk5egrcxtfrrqt3jcli https://fatcat.wiki/release/ett4oyembjfahhe3iwoc44dnja Status.AMBIGUOUS OK.DUMMY -> distinguish by page +> todo: distinguish by page * [ ] https://fatcat.wiki/release/ehu6pdvzvvcmdoyq4l2yf4vciu https://fatcat.wiki/release/2omou6ehgjccbe6yjvr4wgnsha Status.AMBIGUOUS OK.DUMMY + +Blacklist fragment. + * [ ] https://fatcat.wiki/release/zkqujozrx5cnjitmglclt6heqq https://fatcat.wiki/release/urr2gs4dsbbwdl7asgyqnwwtxy Status.AMBIGUOUS OK.DUMMY + +Blacklist fragment. + * [ ] https://fatcat.wiki/release/yy2wzuaxhba7jht72mcjhxuaju https://fatcat.wiki/release/5b3lb2ebmrdp5nzxvohefmadre Status.AMBIGUOUS OK.DUMMY + +> Meeting abstract (ma) versus document. + * [ ] https://fatcat.wiki/release/iwtrxnov2repzlgoi2at2md6tm https://fatcat.wiki/release/s5hm65waingwjmgf3plu76hzu4 Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/b6wfpvotwrecdbygyn27kmihne https://fatcat.wiki/release/3vflegbxtrg4fknx4zyq3rf4im Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/zlywxoy7cfexvaatziqp4ip5m4 https://fatcat.wiki/release/phqelg6oc5hs5dehhgmodcnh5u Status.AMBIGUOUS OK.DUMMY diff --git a/tests/data/release/2omou6ehgjccbe6yjvr4wgnsha b/tests/data/release/2omou6ehgjccbe6yjvr4wgnsha new file mode 100644 index 0000000..e72472f --- /dev/null +++ b/tests/data/release/2omou6ehgjccbe6yjvr4wgnsha @@ -0,0 +1,28 @@ +{ + "abstracts": [], + "container_id": "2y4b4m3v6rdepehsbnvon2yi5a", + "contribs": [], + "ext_ids": { + "doi": "10.1080/00379816009513763" + }, + "extra": { + "crossref": { + "alternative-id": [ + "10.1080/00379816009513763" + ], + "type": "journal-article" + } + }, + "ident": "2omou6ehgjccbe6yjvr4wgnsha", + "language": "en", + "pages": "329-331", + "publisher": "Informa UK Limited", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1963, + "revision": "cec3ccfc-354f-4344-9643-c49e9407ff60", + "state": "active", + "title": "The society's chronicle: Annual general meeting", + "work_id": "2i3c2hrtszeilhpjpepi5eis3u" +} diff --git a/tests/data/release/5b3lb2ebmrdp5nzxvohefmadre b/tests/data/release/5b3lb2ebmrdp5nzxvohefmadre new file mode 100644 index 0000000..6fd2ad0 --- /dev/null +++ b/tests/data/release/5b3lb2ebmrdp5nzxvohefmadre @@ -0,0 +1,31 @@ +{ + "abstracts": [], + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "Steven J. Suess", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1149/1.2408933" + }, + "extra": { + "container_name": "ECS Transactions", + "crossref": { + "type": "proceedings-article" + } + }, + "ident": "5b3lb2ebmrdp5nzxvohefmadre", + "publisher": "ECS", + "refs": [], + "release_type": "paper-conference", + "release_year": 2007, + "revision": "b852e6ef-2b3c-4a78-aad8-168c086ae1ab", + "state": "active", + "title": "Analysis of Discolored and Corroded Components", + "work_id": "ham5odne4jefbfcx4zsboiuzvu" +} diff --git a/tests/data/release/ehu6pdvzvvcmdoyq4l2yf4vciu b/tests/data/release/ehu6pdvzvvcmdoyq4l2yf4vciu new file mode 100644 index 0000000..ee9d455 --- /dev/null +++ b/tests/data/release/ehu6pdvzvvcmdoyq4l2yf4vciu @@ -0,0 +1,28 @@ +{ + "abstracts": [], + "container_id": "2y4b4m3v6rdepehsbnvon2yi5a", + "contribs": [], + "ext_ids": { + "doi": "10.1080/00379816009513736" + }, + "extra": { + "crossref": { + "alternative-id": [ + "10.1080/00379816009513736" + ], + "type": "journal-article" + } + }, + "ident": "ehu6pdvzvvcmdoyq4l2yf4vciu", + "language": "en", + "pages": "227-229", + "publisher": "Informa UK Limited", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1962, + "revision": "0ffa280d-dec2-45e9-bea4-10114a1f46d2", + "state": "active", + "title": "The Society's chronicle: Annual general meeting", + "work_id": "k572bwhqivaspk3wydfn322vuy" +} diff --git a/tests/data/release/urr2gs4dsbbwdl7asgyqnwwtxy b/tests/data/release/urr2gs4dsbbwdl7asgyqnwwtxy new file mode 100644 index 0000000..c8eb3ea --- /dev/null +++ b/tests/data/release/urr2gs4dsbbwdl7asgyqnwwtxy @@ -0,0 +1,27 @@ +{ + "abstracts": [], + "container_id": "jq2tcuaoe5h6lkbunyw6ijhovi", + "contribs": [], + "ext_ids": { + "doi": "10.1144/pygs.14.3.513" + }, + "extra": { + "crossref": { + "type": "journal-article" + } + }, + "ident": "urr2gs4dsbbwdl7asgyqnwwtxy", + "language": "en", + "pages": "513-524", + "publisher": "Geological Society of London", + "refs": [], + "release_date": "1902-01-01", + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1902, + "revision": "3d406b3d-2065-4ffe-b953-0d06bfd377c8", + "state": "active", + "title": "Records of Meetings", + "volume": "14", + "work_id": "kxkil2n2vrdd5b7hcm565ekryi" +} diff --git a/tests/data/release/yy2wzuaxhba7jht72mcjhxuaju b/tests/data/release/yy2wzuaxhba7jht72mcjhxuaju new file mode 100644 index 0000000..ded3bd3 --- /dev/null +++ b/tests/data/release/yy2wzuaxhba7jht72mcjhxuaju @@ -0,0 +1,23 @@ +{ + "abstracts": [], + "container_id": "rb7ftxiprfchdbbjnx5kpywhay", + "contribs": [], + "ext_ids": { + "doi": "10.1149/ma2006-01/6/327" + }, + "extra": { + "crossref": { + "type": "journal-article" + } + }, + "ident": "yy2wzuaxhba7jht72mcjhxuaju", + "publisher": "The Electrochemical Society", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2006, + "revision": "fd3fd3dc-b506-41d1-bbae-41aef34335d3", + "state": "active", + "title": "Analysis of Discolored and Corroded Components", + "work_id": "vfe7uctv7zelvohb3u7xuwsgpy" +} diff --git a/tests/data/release/zkqujozrx5cnjitmglclt6heqq b/tests/data/release/zkqujozrx5cnjitmglclt6heqq new file mode 100644 index 0000000..0083bc3 --- /dev/null +++ b/tests/data/release/zkqujozrx5cnjitmglclt6heqq @@ -0,0 +1,27 @@ +{ + "abstracts": [], + "container_id": "jq2tcuaoe5h6lkbunyw6ijhovi", + "contribs": [], + "ext_ids": { + "doi": "10.1144/pygs.15.3.483" + }, + "extra": { + "crossref": { + "type": "journal-article" + } + }, + "ident": "zkqujozrx5cnjitmglclt6heqq", + "language": "en", + "pages": "483-493", + "publisher": "Geological Society of London", + "refs": [], + "release_date": "1905-01-01", + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1905, + "revision": "4e15035c-a8b5-43f3-9500-925bd6a9e042", + "state": "active", + "title": "RECORDS OF MEETINGS", + "volume": "15", + "work_id": "ydqm2e7jsndexkhpavlyqabram" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index b3054e0..9b449ea 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -106,3 +106,6 @@ ij3yuoh6lrh3tkrv5o7gfk6yyi,tur236mqljdfdnlzbbnks2sily,Status.STRONG,OK.CUSTOM_IE neznj5fb4nf3tdqnotnbe34b6e,gcqdvvjiq5bphl7lpc4invi4vy,Status.STRONG,OK.CUSTOM_BSI_UNDATED he334wpbobegxhptpkvvrufioq,td3ouhgtzbbe7ctevfnldqkoba,Status.EXACT,OK.DATACITE_VERSION caxa7qbfqvg3bkgz4nwvapgnvi,rnso2swxzvfonemgzrth3arumi,Status.DIFFERENT,Miss.CONTAINER +ehu6pdvzvvcmdoyq4l2yf4vciu,2omou6ehgjccbe6yjvr4wgnsha,Status.AMBIGUOUS,Miss.BLACKLISTED_FRAGMENT +zkqujozrx5cnjitmglclt6heqq,urr2gs4dsbbwdl7asgyqnwwtxy,Status.AMBIGUOUS,Miss.BLACKLISTED_FRAGMENT +5b3lb2ebmrdp5nzxvohefmadre,yy2wzuaxhba7jht72mcjhxuaju,Status.DIFFERENT,Miss.CUSTOM_IOP_MA_PATTERN |