about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fuzzycat/common.py 1
-rw-r--r-- fuzzycat/verify.py 10
-rw-r--r-- tests/data/release/2fok7qpz5nbofay762elgqklwa 43
-rw-r--r-- tests/data/release/4mvqmzsmofckjdcy7zippajceu 43
-rw-r--r-- tests/data/release/c5ui7bgop5bfpk67l7ngvr4uxy 46
-rw-r--r-- tests/data/release/qjavgacwznftlancwzxldtjt6y 43
-rw-r--r-- tests/data/verify.csv 2
7 files changed, 188 insertions, 0 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index 79e2b14..e736de6 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -58,6 +58,7 @@ class Reason(str, Enum):
SINGULAR_CLUSTER = 'singular_cluster'
SLUG_TITLE_AUTHOR_MATCH = 'slug_title_author_match'
SUBTITLE = 'subtitle'
+ TITLE_ARTIFACT = 'title_artifact'
TITLE_AUTHOR_MATCH = 'title_author_match'
TITLE_FILENAME = 'title_filename'
TOKENIZED_AUTHORS = 'tokenized_authors'
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index ff4567b..a44154a 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -596,4 +596,14 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
except (ValueError, PathAccessError):
pass
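+ # Title artifact: same authors, container and release year, and one title is a proper substring of the other (e.g. the original title with a translation appended), e.g. https://fatcat.wiki/release/search?q=%22A+nova+classifica%C3%A7%C3%A3o+dos+tumores+da+mama+%22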
+ try:
+ a_container_id = glom(a, "container_id")
+ b_container_id = glom(b, "container_id")
+ if a_authors == b_authors and a_container_id == b_container_id and a_release_year == b_release_year and a_title != b_title and (
+ a_title in b_title or b_title in a_title):
+ return Verify(Status.STRONG, Reason.TITLE_ARTIFACT)
+ except PathAccessError:
+ pass
+
return Verify(Status.AMBIGUOUS, Reason.UNKNOWN)
diff --git a/tests/data/release/2fok7qpz5nbofay762elgqklwa b/tests/data/release/2fok7qpz5nbofay762elgqklwa
new file mode 100644
index 0000000..58a1918
--- /dev/null
+++ b/tests/data/release/2fok7qpz5nbofay762elgqklwa
@@ -0,0 +1,43 @@
+{
+ "abstracts": [],
+ "container_id": "oyuolsoctnezfhswyufqrsbjou",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "José Eymard H. Pittella",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "raw_name": "Alfredo J. A. Barbosa",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1590/s1676-24442012000600002"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "S1676-24442012000600002"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "2fok7qpz5nbofay762elgqklwa",
+ "language": "en",
+ "pages": "406-407",
+ "publisher": "FapUNIFESP (SciELO)",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2012,
+ "revision": "3c528b1e-e308-439a-9c68-11436b028750",
+ "state": "active",
+ "title": "A nova classificação dos tumores da mama da OMS",
+ "volume": "48",
+ "work_id": "lxey46fwm5csnote5v3owqi7eq"
+}
diff --git a/tests/data/release/4mvqmzsmofckjdcy7zippajceu b/tests/data/release/4mvqmzsmofckjdcy7zippajceu
new file mode 100644
index 0000000..d19b988
--- /dev/null
+++ b/tests/data/release/4mvqmzsmofckjdcy7zippajceu
@@ -0,0 +1,43 @@
+{
+ "abstracts": [],
+ "container_id": "5idg4ohjqfggjd2vdgxino7424",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "W. C. B.",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1093/nq/s7-i.25.498e"
+ },
+ "extra": {
+ "crossref": {
+ "subject": [
+ "Linguistics and Language",
+ "Literature and Literary Theory",
+ "Library and Information Sciences",
+ "Language and Linguistics"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "4mvqmzsmofckjdcy7zippajceu",
+ "issue": "25",
+ "language": "en",
+ "pages": "498-498",
+ "publisher": "Oxford University Press (OUP)",
+ "refs": [],
+ "release_date": "1886-06-19",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1886,
+ "revision": "ad0594c6-8196-420c-b907-c2c534d1f059",
+ "state": "active",
+ "title": "\"Deux oreilles\"",
+ "volume": "s7-I",
+ "work_id": "5ord7xs65ra3fmec2e65l4w5c4"
+}
diff --git a/tests/data/release/c5ui7bgop5bfpk67l7ngvr4uxy b/tests/data/release/c5ui7bgop5bfpk67l7ngvr4uxy
new file mode 100644
index 0000000..c78d1f4
--- /dev/null
+++ b/tests/data/release/c5ui7bgop5bfpk67l7ngvr4uxy
@@ -0,0 +1,46 @@
+{
+ "abstracts": [],
+ "container_id": "oyuolsoctnezfhswyufqrsbjou",
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "José Eymard H. Pittella",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "raw_name": "Alfredo J. A. Barbosa",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doaj": "9f88dd0f9f5848359eeef3d5e8663084"
+ },
+ "extra": {
+ "country": "br",
+ "doaj": {
+ "subject": [
+ {
+ "code": "RB1-214",
+ "scheme": "LCC",
+ "term": "Pathology"
+ }
+ ]
+ },
+ "release_month": 12
+ },
+ "ident": "c5ui7bgop5bfpk67l7ngvr4uxy",
+ "language": "en",
+ "license_slug": "cc-by",
+ "pages": "406-407",
+ "publisher": "Sociedade Brasileira de Patologia Clínica",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2012,
+ "revision": "06b028af-5a12-41ae-8a5c-5d0aa4323f69",
+ "state": "active",
+ "title": "A nova classificação dos tumores da mama da OMS The new WHO classification of breast tumors",
+ "volume": "48",
+ "work_id": "jgchw45ryrd5xei2spovs3wntm"
+}
diff --git a/tests/data/release/qjavgacwznftlancwzxldtjt6y b/tests/data/release/qjavgacwznftlancwzxldtjt6y
new file mode 100644
index 0000000..96a52fb
--- /dev/null
+++ b/tests/data/release/qjavgacwznftlancwzxldtjt6y
@@ -0,0 +1,43 @@
+{
+ "abstracts": [],
+ "container_id": "5idg4ohjqfggjd2vdgxino7424",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "N. G. N. L.",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1093/nq/s7-i.25.498c"
+ },
+ "extra": {
+ "crossref": {
+ "subject": [
+ "Linguistics and Language",
+ "Literature and Literary Theory",
+ "Library and Information Sciences",
+ "Language and Linguistics"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "qjavgacwznftlancwzxldtjt6y",
+ "issue": "25",
+ "language": "en",
+ "pages": "498-498",
+ "publisher": "Oxford University Press (OUP)",
+ "refs": [],
+ "release_date": "1886-06-19",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1886,
+ "revision": "0c45d3fa-589b-4107-9e1e-950dcd68861c",
+ "state": "active",
+ "title": "\"Deux oreilles\"",
+ "volume": "s7-I",
+ "work_id": "aoludylnmrgppdmrblnakpbm2m"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 5822a9e..5b9bc7e 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -244,3 +244,5 @@ r5ey6krhrfbjrgvqqdzgora32m,wfjdwsrr2rgd5aa3hsapivkp3m,Status.DIFFERENT,
uvvfqlbtezh45ctyqrwqwfxlo4,wfjdwsrr2rgd5aa3hsapivkp3m,Status.DIFFERENT,
ex2u4mgrpffp3asznccqd6n35q,zwpq2nocbzcixl6sswabsjg4ti,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY
2wx322n7pvbyxnbbrqrvbp7p74,lhgtefitvbd5lf6prb76mrgcci,Status.STRONG,JACCARD_AUTHORS
+4mvqmzsmofckjdcy7zippajceu,qjavgacwznftlancwzxldtjt6y,Status.DIFFERENT,SHARED_DOI_PREFIX
+2fok7qpz5nbofay762elgqklwa,c5ui7bgop5bfpk67l7ngvr4uxy,Status.STRONG,TITLE_ARTIFACT