diff options
-rw-r--r-- | fuzzycat/common.py | 1 | ||||
-rw-r--r-- | fuzzycat/verify.py | 10 | ||||
-rw-r--r-- | tests/data/release/2fok7qpz5nbofay762elgqklwa | 43 | ||||
-rw-r--r-- | tests/data/release/4mvqmzsmofckjdcy7zippajceu | 43 | ||||
-rw-r--r-- | tests/data/release/c5ui7bgop5bfpk67l7ngvr4uxy | 46 | ||||
-rw-r--r-- | tests/data/release/qjavgacwznftlancwzxldtjt6y | 43 | ||||
-rw-r--r-- | tests/data/verify.csv | 2 |
7 files changed, 188 insertions, 0 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py index 79e2b14..e736de6 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -58,6 +58,7 @@ class Reason(str, Enum): SINGULAR_CLUSTER = 'singular_cluster' SLUG_TITLE_AUTHOR_MATCH = 'slug_title_author_match' SUBTITLE = 'subtitle' + TITLE_ARTIFACT = 'title_artifact' TITLE_AUTHOR_MATCH = 'title_author_match' TITLE_FILENAME = 'title_filename' TOKENIZED_AUTHORS = 'tokenized_authors' diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index ff4567b..a44154a 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -596,4 +596,14 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: except (ValueError, PathAccessError): pass + # A variant of translated titles, e.g. https://fatcat.wiki/release/search?q=%22A+nova+classifica%C3%A7%C3%A3o+dos+tumores+da+mama+%22 + try: + a_container_id = glom(a, "container_id") + b_container_id = glom(b, "container_id") + if a_authors == b_authors and a_container_id == b_container_id and a_release_year == b_release_year and a_title != b_title and ( + a_title in b_title or b_title in a_title): + return Verify(Status.STRONG, Reason.TITLE_ARTIFACT) + except PathAccessError: + pass + return Verify(Status.AMBIGUOUS, Reason.UNKNOWN) diff --git a/tests/data/release/2fok7qpz5nbofay762elgqklwa b/tests/data/release/2fok7qpz5nbofay762elgqklwa new file mode 100644 index 0000000..58a1918 --- /dev/null +++ b/tests/data/release/2fok7qpz5nbofay762elgqklwa @@ -0,0 +1,43 @@ +{ + "abstracts": [], + "container_id": "oyuolsoctnezfhswyufqrsbjou", + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "José Eymard H. Pittella", + "role": "author" + }, + { + "index": 1, + "raw_name": "Alfredo J. A. Barbosa", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1590/s1676-24442012000600002" + }, + "extra": { + "crossref": { + "alternative-id": [ + "S1676-24442012000600002" + ], + "type": "journal-article" + } + }, + "ident": "2fok7qpz5nbofay762elgqklwa", + "language": "en", + "pages": "406-407", + "publisher": "FapUNIFESP (SciELO)", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2012, + "revision": "3c528b1e-e308-439a-9c68-11436b028750", + "state": "active", + "title": "A nova classificação dos tumores da mama da OMS", + "volume": "48", + "work_id": "lxey46fwm5csnote5v3owqi7eq" +} diff --git a/tests/data/release/4mvqmzsmofckjdcy7zippajceu b/tests/data/release/4mvqmzsmofckjdcy7zippajceu new file mode 100644 index 0000000..d19b988 --- /dev/null +++ b/tests/data/release/4mvqmzsmofckjdcy7zippajceu @@ -0,0 +1,43 @@ +{ + "abstracts": [], + "container_id": "5idg4ohjqfggjd2vdgxino7424", + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "W. C. B.", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1093/nq/s7-i.25.498e" + }, + "extra": { + "crossref": { + "subject": [ + "Linguistics and Language", + "Literature and Literary Theory", + "Library and Information Sciences", + "Language and Linguistics" + ], + "type": "journal-article" + } + }, + "ident": "4mvqmzsmofckjdcy7zippajceu", + "issue": "25", + "language": "en", + "pages": "498-498", + "publisher": "Oxford University Press (OUP)", + "refs": [], + "release_date": "1886-06-19", + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1886, + "revision": "ad0594c6-8196-420c-b907-c2c534d1f059", + "state": "active", + "title": "\"Deux oreilles\"", + "volume": "s7-I", + "work_id": "5ord7xs65ra3fmec2e65l4w5c4" +} diff --git a/tests/data/release/c5ui7bgop5bfpk67l7ngvr4uxy b/tests/data/release/c5ui7bgop5bfpk67l7ngvr4uxy new file mode 100644 index 0000000..c78d1f4 --- /dev/null +++ b/tests/data/release/c5ui7bgop5bfpk67l7ngvr4uxy @@ -0,0 +1,46 @@ +{ + "abstracts": [], + "container_id": "oyuolsoctnezfhswyufqrsbjou", + "contribs": [ + { + "index": 0, + "raw_name": "José Eymard H. Pittella", + "role": "author" + }, + { + "index": 1, + "raw_name": "Alfredo J. A. Barbosa", + "role": "author" + } + ], + "ext_ids": { + "doaj": "9f88dd0f9f5848359eeef3d5e8663084" + }, + "extra": { + "country": "br", + "doaj": { + "subject": [ + { + "code": "RB1-214", + "scheme": "LCC", + "term": "Pathology" + } + ] + }, + "release_month": 12 + }, + "ident": "c5ui7bgop5bfpk67l7ngvr4uxy", + "language": "en", + "license_slug": "cc-by", + "pages": "406-407", + "publisher": "Sociedade Brasileira de Patologia Clínica", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2012, + "revision": "06b028af-5a12-41ae-8a5c-5d0aa4323f69", + "state": "active", + "title": "A nova classificação dos tumores da mama da OMS The new WHO classification of breast tumors", + "volume": "48", + "work_id": "jgchw45ryrd5xei2spovs3wntm" +} diff --git a/tests/data/release/qjavgacwznftlancwzxldtjt6y b/tests/data/release/qjavgacwznftlancwzxldtjt6y new file mode 100644 index 0000000..96a52fb --- /dev/null +++ b/tests/data/release/qjavgacwznftlancwzxldtjt6y @@ -0,0 +1,43 @@ +{ + "abstracts": [], + "container_id": "5idg4ohjqfggjd2vdgxino7424", + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "N. G. N. L.", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1093/nq/s7-i.25.498c" + }, + "extra": { + "crossref": { + "subject": [ + "Linguistics and Language", + "Literature and Literary Theory", + "Library and Information Sciences", + "Language and Linguistics" + ], + "type": "journal-article" + } + }, + "ident": "qjavgacwznftlancwzxldtjt6y", + "issue": "25", + "language": "en", + "pages": "498-498", + "publisher": "Oxford University Press (OUP)", + "refs": [], + "release_date": "1886-06-19", + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1886, + "revision": "0c45d3fa-589b-4107-9e1e-950dcd68861c", + "state": "active", + "title": "\"Deux oreilles\"", + "volume": "s7-I", + "work_id": "aoludylnmrgppdmrblnakpbm2m" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 5822a9e..5b9bc7e 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -244,3 +244,5 @@ r5ey6krhrfbjrgvqqdzgora32m,wfjdwsrr2rgd5aa3hsapivkp3m,Status.DIFFERENT, uvvfqlbtezh45ctyqrwqwfxlo4,wfjdwsrr2rgd5aa3hsapivkp3m,Status.DIFFERENT, ex2u4mgrpffp3asznccqd6n35q,zwpq2nocbzcixl6sswabsjg4ti,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY 2wx322n7pvbyxnbbrqrvbp7p74,lhgtefitvbd5lf6prb76mrgcci,Status.STRONG,JACCARD_AUTHORS +4mvqmzsmofckjdcy7zippajceu,qjavgacwznftlancwzxldtjt6y,Status.DIFFERENT,SHARED_DOI_PREFIX +2fok7qpz5nbofay762elgqklwa,c5ui7bgop5bfpk67l7ngvr4uxy,Status.STRONG,TITLE_ARTIFACT |