diff options
-rw-r--r-- | fuzzycat/common.py | 1 | ||||
-rw-r--r-- | fuzzycat/verify.py | 22 | ||||
-rw-r--r-- | notes/2020_11_testruns.md | 2 | ||||
-rw-r--r-- | tests/data/release/5yixxzyl3vh4xd56lwcraowgty | 43 | ||||
-rw-r--r-- | tests/data/release/pobnow7sxfhnxhltgwpru5k7oi | 21 | ||||
-rw-r--r-- | tests/data/release/tm3gaiumkvb3xc7t3i6suna6u4 | 35 | ||||
-rw-r--r-- | tests/data/release/uplqxenmk5axjes6zokml6q73y | 26 | ||||
-rw-r--r-- | tests/data/release/yespzqkm2zed7n4vhjpkddap5e | 24 | ||||
-rw-r--r-- | tests/data/verify.csv | 3 |
9 files changed, 175 insertions, 2 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py index 5bf033c..20a5ddd 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -56,6 +56,7 @@ class Miss(str, Enum): DATASET_DOI = 'miss.dataset_doi' JSTOR_ID = 'miss.jstor_id' NUM_DIFF = 'miss.num_diff' + PAGE_COUNT = 'miss.page_count' RELEASE_TYPE = 'miss.release_type' SHARED_DOI_PREFIX = 'miss.shared_doi_prefix' SHORT_TITLE = 'miss.short_title' diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 88e83d5..76571da 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -322,6 +322,8 @@ def compare(a, b): ]) if len(types & ignore_release_types) == 0: return (Status.DIFFERENT, Miss.RELEASE_TYPE) + if "dataset" in types and ("article" in types or "article-journal" in types): + return (Status.DIFFERENT, Miss.RELEASE_TYPE) except PathAccessError: pass @@ -543,4 +545,24 @@ def compare(a, b): except PathAccessError: pass + # If pages exists, but differ too much, bail out. + # https://fatcat.wiki/release/tm3gaiumkvb3xc7t3i6suna6u4 + # https://fatcat.wiki/release/r6dj63wh3zcrrolisn6xuacnve + try: + a_pages = glom(a, "pages") + b_pages = glom(b, "pages") + page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})") + a_match = page_pattern.match(a_pages) + b_match = page_pattern.match(b_pages) + if a_match and b_match: + a_start, a_end = a_match.groups() + b_start, b_end = b_match.groups() + a_num_pages = int(a_end) - int(a_start) + b_num_pages = int(b_end) - int(b_start) + if a_num_pages >= 0 and b_num_pages >= 0: + if abs(a_num_pages - b_num_pages) > 5: + return (Status.DIFFERENT, Miss.PAGE_COUNT) + except PathAccessError: + pass + return (Status.AMBIGUOUS, OK.DUMMY) diff --git a/notes/2020_11_testruns.md b/notes/2020_11_testruns.md index cfa6c6f..51834fc 100644 --- a/notes/2020_11_testruns.md +++ b/notes/2020_11_testruns.md @@ -225,8 +225,6 @@ Hard to say (but seem to be a rerun of an article in a "similar" journal). Ok. * [ ] https://fatcat.wiki/release/hwnqyz7n65eabhlivvkipkytji https://fatcat.wiki/release/cwqujxztefdghhssb7ysxj7b5m Status.AMBIGUOUS OK.DUMMY - - * [ ] https://fatcat.wiki/release/yespzqkm2zed7n4vhjpkddap5e https://fatcat.wiki/release/5yixxzyl3vh4xd56lwcraowgty Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/47opwjqugjecjmiqgukahw6p2m https://fatcat.wiki/release/real7tmfxjan7j3fgkilt7fze4 Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/gaf7gjwetrbpzntrp4bt4nxaiy https://fatcat.wiki/release/htsa3mrirndbdjtdangr4mzrdu Status.AMBIGUOUS OK.DUMMY diff --git a/tests/data/release/5yixxzyl3vh4xd56lwcraowgty b/tests/data/release/5yixxzyl3vh4xd56lwcraowgty new file mode 100644 index 0000000..3fb8fb4 --- /dev/null +++ b/tests/data/release/5yixxzyl3vh4xd56lwcraowgty @@ -0,0 +1,43 @@ +{ + "abstracts": [], + "container_id": "cjmzhtkkdjgjphz5zuzahenz2a", + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "WALTER S. ROOT", + "role": "author" + }, + { + "index": 1, + "raw_name": "FREDERICK G. HOFMANN", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1097/00000542-196701000-00100" + }, + "extra": { + "crossref": { + "type": "journal-article" + }, + "subtitle": [ + "" + ] + }, + "ident": "5yixxzyl3vh4xd56lwcraowgty", + "language": "en", + "pages": "284", + "publisher": "Ovid Technologies (Wolters Kluwer Health)", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1967, + "revision": "ec95ac81-64ec-41c0-af9e-845f0dd6cbbe", + "state": "active", + "title": "Physiological Pharmacology. A Comprehensive Treatise", + "volume": "28", + "work_id": "gznfk7fm4jdwzkbudxyo3kemfu" +} diff --git a/tests/data/release/pobnow7sxfhnxhltgwpru5k7oi b/tests/data/release/pobnow7sxfhnxhltgwpru5k7oi new file mode 100644 index 0000000..d5a5ed8 --- /dev/null +++ b/tests/data/release/pobnow7sxfhnxhltgwpru5k7oi @@ -0,0 +1,21 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.1163/2352-0248_edn_a2481000" + }, + "extra": { + "container_name": "Enzyklopädie der Neuzeit Online", + "crossref": { + "type": "dataset" + } + }, + "ident": "pobnow7sxfhnxhltgwpru5k7oi", + "publisher": "Brill Academic Publishers", + "refs": [], + "release_type": "dataset", + "revision": "8266c500-5b31-47e5-94dd-d04d90f5fe45", + "state": "active", + "title": "Leitbild", + "work_id": "bzxfao6pe5hmvar75ebglzyl3i" +} diff --git a/tests/data/release/tm3gaiumkvb3xc7t3i6suna6u4 b/tests/data/release/tm3gaiumkvb3xc7t3i6suna6u4 new file mode 100644 index 0000000..4f067cd --- /dev/null +++ b/tests/data/release/tm3gaiumkvb3xc7t3i6suna6u4 @@ -0,0 +1,35 @@ +{ + "abstracts": [], + "container_id": "qcsreng6drgiflvgxfo6tbc7cq", + "contribs": [], + "ext_ids": { + "doi": "10.1111/rsr.14216" + }, + "extra": { + "crossref": { + "alternative-id": [ + "10.1111/rsr.14216" + ], + "archive": [ + "Portico" + ], + "subject": [ + "Religious studies" + ], + "type": "journal-article" + } + }, + "ident": "tm3gaiumkvb3xc7t3i6suna6u4", + "language": "en", + "pages": "386-386", + "publisher": "Wiley", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2019, + "revision": "5c4d2543-33a3-4b87-9d20-e49786fd94f7", + "state": "active", + "title": "JEWISH THOUGHT", + "volume": "45", + "work_id": "ziwjsjtaxjentaednk52ht5eui" +} diff --git a/tests/data/release/uplqxenmk5axjes6zokml6q73y b/tests/data/release/uplqxenmk5axjes6zokml6q73y new file mode 100644 index 0000000..e4d580c --- /dev/null +++ b/tests/data/release/uplqxenmk5axjes6zokml6q73y @@ -0,0 +1,26 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.5169/seals-463775" + }, + "extra": { + "datacite": { + "metadataVersion": 2, + "resourceType": "Journal Article", + "resourceTypeGeneral": "Text" + }, + "release_month": 5 + }, + "ident": "uplqxenmk5axjes6zokml6q73y", + "publisher": "E. Löpfe-Benz", + "refs": [], + "release_date": "1931-05-15", + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1931, + "revision": "e3572a08-2c3e-4231-8548-6427b11752c3", + "state": "active", + "title": "Leitbild", + "work_id": "fcu63qnsmbfmbc53x5jrhh3lma" +} diff --git a/tests/data/release/yespzqkm2zed7n4vhjpkddap5e b/tests/data/release/yespzqkm2zed7n4vhjpkddap5e new file mode 100644 index 0000000..28316be --- /dev/null +++ b/tests/data/release/yespzqkm2zed7n4vhjpkddap5e @@ -0,0 +1,24 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.1016/b978-1-4832-2760-3.50001-0" + }, + "extra": { + "container_name": "The Nervous System", + "crossref": { + "type": "book-chapter" + } + }, + "ident": "yespzqkm2zed7n4vhjpkddap5e", + "pages": "ii", + "publisher": "Elsevier", + "refs": [], + "release_stage": "published", + "release_type": "chapter", + "release_year": 1967, + "revision": "9322dda3-bfde-4eeb-8185-224b11b4eb85", + "state": "active", + "title": "PHYSIOLOGICAL PHARMACOLOGY: A Comprehensive Treatise", + "work_id": "rgluu7ppjrfaznbt254xkn4nou" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 37f4b6e..03fa947 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -143,3 +143,6 @@ f5ebjc63j5dzpct5hsme5j3ote,zeoquc2f4nbmdbmbcbkmkxmtzi,Status.AMBIGUOUS, zvsffdeufjb5dbchww7ydqdq3a,5rcu6myqx5ezjjytzpvsauyut4,Status.STRONG,OK.PMID_DOI_PAIR cd5aik2whrd5jlvleyvdq6iwja,kfttghqcsbddvofqd7l4bhtavy,Status.DIFFERENT,Miss.COMPONENT hwnqyz7n65eabhlivvkipkytji,cwqujxztefdghhssb7ysxj7b5m,Status.STRONG,OK.VERSIONED_DOI +yespzqkm2zed7n4vhjpkddap5e,5yixxzyl3vh4xd56lwcraowgty,Status.AMBIGUOUS, +pobnow7sxfhnxhltgwpru5k7oi,uplqxenmk5axjes6zokml6q73y,Status.DIFFERENT,Miss.RELEASE_TYPE +tm3gaiumkvb3xc7t3i6suna6u4,pobnow7sxfhnxhltgwpru5k7oi,Status.DIFFERENT,Miss.RELEASE_TYPE |