From 4b5d71b8ca085c102fff1cf6629fa58582873d27 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 9 Dec 2020 01:42:16 +0100 Subject: add another case --- Makefile | 2 +- fuzzycat/common.py | 1 + fuzzycat/verify.py | 14 +++++++++++++ notes/2020_11_testruns.md | 6 ++++-- tests/data/release/idpgijvcsnbqrgs2dg36vzzdzm | 29 +++++++++++++++++++++++++++ tests/data/release/wm2p5fznwffknjx56lvmr7hn4q | 29 +++++++++++++++++++++++++++ tests/data/verify.csv | 3 ++- 7 files changed, 80 insertions(+), 4 deletions(-) create mode 100644 tests/data/release/idpgijvcsnbqrgs2dg36vzzdzm create mode 100644 tests/data/release/wm2p5fznwffknjx56lvmr7hn4q diff --git a/Makefile b/Makefile index a0e2946..9354f76 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ cov: ## Run coverage report .PHONY: test test: ## Run coverage report - pytest -o log_cli=true -v fuzzycat/*.py tests/*.py + pytest -o log_cli=true -s -vvv fuzzycat/*.py tests/*.py .PHONY: lint lint: $(PY_FILES) diff --git a/fuzzycat/common.py b/fuzzycat/common.py index d87bef4..977db2c 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -57,3 +57,4 @@ class Miss(str, Enum): SUBTITLE = 'miss.subtitle' TITLE_FILENAME = 'miss.title_filename' YEAR = 'miss.year' + SHARED_DOI_PREFIX = 'miss.shared_doi_prefix' diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index e945a15..dd2ced8 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -437,6 +437,20 @@ def compare(a, b): (dict_key_exists(b, "ext_ids.pmid") and not dict_key_exists(b, "ext_ids.doi"))): return (Status.STRONG, OK.PMID_DOI_PAIR) + # Publication from same publisher and different DOI or year are probably + # different. 
+ try: + a_container_id = glom(a, "container_id") + b_container_id = glom(b, "container_id") + a_doi = glom(a, "ext_ids.doi") + b_doi = glom(b, "ext_ids.doi") + + if a_container_id == b_container_id and a_doi != b_doi and not has_doi_prefix( + a_doi, "10.1126"): + return (Status.DIFFERENT, Miss.SHARED_DOI_PREFIX) + except PathAccessError: + pass + if a_authors and len(a_slug_authors & b_slug_authors) == 0: # Before we bail out, run an authors similarity check. TODO: This is # not the right place, but lives here now, since these cases popped up diff --git a/notes/2020_11_testruns.md b/notes/2020_11_testruns.md index edc39d0..6dd2ec8 100644 --- a/notes/2020_11_testruns.md +++ b/notes/2020_11_testruns.md @@ -133,10 +133,12 @@ Defer. * [ ] https://fatcat.wiki/release/7x7tszf54zggvp4xkrhakp667u https://fatcat.wiki/release/eqcgtpav3na5jh56o5vjsvb4ei Status.AMBIGUOUS OK.DUMMY -Same, pubmed id only and oxfordjournals doi, same year. - +Same, pubmed id only and oxfordjournals doi, same year. New: `PMID_DOI_PAIR` * [ ] https://fatcat.wiki/release/idpgijvcsnbqrgs2dg36vzzdzm https://fatcat.wiki/release/wm2p5fznwffknjx56lvmr7hn4q Status.AMBIGUOUS OK.DUMMY + + + * [ ] https://fatcat.wiki/release/nqcfu4il45aixekvk3rwflahdm https://fatcat.wiki/release/72uzveph65ce7kfdct2wpgh5j4 Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/zizw6bgxu5cnxfx5h3v7q7gute https://fatcat.wiki/release/jwh6xci4m5dktmea6bphhc3mjy Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/b7bbygyawzdsthai7j7rmztrxe https://fatcat.wiki/release/mvvbim7kdffvtosuldtv5m3uy4 Status.AMBIGUOUS OK.DUMMY diff --git a/tests/data/release/idpgijvcsnbqrgs2dg36vzzdzm b/tests/data/release/idpgijvcsnbqrgs2dg36vzzdzm new file mode 100644 index 0000000..d3be763 --- /dev/null +++ b/tests/data/release/idpgijvcsnbqrgs2dg36vzzdzm @@ -0,0 +1,29 @@ +{ + "abstracts": [], + "container_id": "ytowgmaklbbvlgtjcokeyrawb4", + "contribs": [], + "ext_ids": { + "doi": "10.1016/s0026-0576(00)83224-9" + }, + "extra": { + 
"crossref": { + "alternative-id": [ + "S0026057600832249" + ], + "type": "journal-article" + } + }, + "ident": "idpgijvcsnbqrgs2dg36vzzdzm", + "language": "en", + "pages": "33", + "publisher": "Elsevier BV", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2000, + "revision": "150ceb07-2033-4488-8144-d14bf2241c84", + "state": "active", + "title": "Maxi-Blast Inc.", + "volume": "98", + "work_id": "xj5uedooxza3xjftmf5rurkvoi" +} diff --git a/tests/data/release/wm2p5fznwffknjx56lvmr7hn4q b/tests/data/release/wm2p5fznwffknjx56lvmr7hn4q new file mode 100644 index 0000000..f06160e --- /dev/null +++ b/tests/data/release/wm2p5fznwffknjx56lvmr7hn4q @@ -0,0 +1,29 @@ +{ + "abstracts": [], + "container_id": "ytowgmaklbbvlgtjcokeyrawb4", + "contribs": [], + "ext_ids": { + "doi": "10.1016/s0026-0576(01)80334-2" + }, + "extra": { + "crossref": { + "alternative-id": [ + "S0026057601803342" + ], + "type": "journal-article" + } + }, + "ident": "wm2p5fznwffknjx56lvmr7hn4q", + "language": "en", + "pages": "36", + "publisher": "Elsevier BV", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1999, + "revision": "805d5835-bcb3-439a-916f-0e9996fbad87", + "state": "active", + "title": "Maxi-Blast Inc.", + "volume": "97", + "work_id": "frfzf2g6srcs3b7wcdp6wgamzi" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index e03eec4..f631cf4 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -94,7 +94,7 @@ fuaz2iolhjegfpdmob3i3efvgm,uxzn4nznrfbttivwzdc7noptku,Status.EXACT,OK.TITLE_AUTH 7j2dsplr45bhvdtrhqa7hykwka,pxzy4k45xjhgfgw6znf5xjayfa,, 7j2dsplr45bhvdtrhqa7hykwka,ud3tzdfacncvnkj232lkvvg34q,Status.EXACT,OK.DOI pxzy4k45xjhgfgw6znf5xjayfa,ud3tzdfacncvnkj232lkvvg34q,, -b5p5i7phjfejhiecjaz4arkp3m,rzicki3gcjayxaic7ckyx6bcmq,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY +b5p5i7phjfejhiecjaz4arkp3m,rzicki3gcjayxaic7ckyx6bcmq,Status.DIFFERENT,Miss.SHARED_DOI_PREFIX 
bkonm3q2dbegde3i4sl6h6lkvq,hhyknlu55vdezg3xsqxa4as2eu,Status.EXACT,OK.TITLE_AUTHOR_MATCH bkonm3q2dbegde3i4sl6h6lkvq,s7mpon5havhx3eil563hcz6anu,Status.EXACT,OK.DOI hhyknlu55vdezg3xsqxa4as2eu,s7mpon5havhx3eil563hcz6anu,Status.EXACT,OK.TITLE_AUTHOR_MATCH @@ -128,3 +128,4 @@ mz6a32xbp5f67i2cnbco2hmzj4,fo5dsqeocfekfhqdzgqyng3z6q,Status.AMBIGUOUS,Miss.CUST g2swo5fewnhv3ihmlpl32sojr4,ab2q56gokfdmzpccrmwfcdljgy,Status.AMBIGUOUS,Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW tmlg73royrdwdhl6nijf6m7vzy,3w4tibll4rdernjrn4hkkyqsem,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY eqcgtpav3na5jh56o5vjsvb4ei,7x7tszf54zggvp4xkrhakp667u,Status.STRONG,OK.PMID_DOI_PAIR +wm2p5fznwffknjx56lvmr7hn4q,idpgijvcsnbqrgs2dg36vzzdzm,Status.DIFFERENT,Miss.SHARED_DOI_PREFIX -- cgit v1.2.3