about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fuzzycat/utils.py 4
-rw-r--r-- fuzzycat/verify.py 15
-rw-r--r-- tests/data/release/iitldffmnncijgnf6ujb6zmdfu 30
-rw-r--r-- tests/data/release/ppnzru2opnhxlai7pcmo7phe4i 30
-rw-r--r-- tests/data/verify.csv 1
5 files changed, 71 insertions, 9 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 4d1325d..1cac668 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,14 +1,13 @@
import io
import itertools
-import re
import string
+import re
printable_no_punct = string.digits + string.ascii_letters + string.whitespace
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
-
def slugify_string(s: str) -> str:
"""
Keeps ascii chars and single whitespace only.
@@ -90,3 +89,4 @@ def contains_chemical_formula(s):
for token in s.split():
if CHEM_FORMULA.search(token):
return True
+
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index ab26603..2bb4adb 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -328,6 +328,14 @@ TITLE_FRAGMENT_BLACKLIST = set([
"untersuchung einzelner abdominaler regionen und organe",
])
+CONTAINER_NAME_BLACKLIST = set([
+ "crossref listing of deleted dois",
+])
+
+PUBLISHER_BLACKLIST = set([
+ "test accounts",
+])
+
# These titles appear too often, so ignore them for now.
TITLE_BLACKLIST = set([
"",
@@ -3526,10 +3534,3 @@ TITLE_BLACKLIST = set([
"週刊ダイヤモンド = diamond weekly 別冊",
])
-CONTAINER_NAME_BLACKLIST = set([
- "crossref listing of deleted dois",
-])
-
-PUBLISHER_BLACKLIST = set([
- "test accounts",
-])
diff --git a/tests/data/release/iitldffmnncijgnf6ujb6zmdfu b/tests/data/release/iitldffmnncijgnf6ujb6zmdfu
new file mode 100644
index 0000000..f2c1b84
--- /dev/null
+++ b/tests/data/release/iitldffmnncijgnf6ujb6zmdfu
@@ -0,0 +1,30 @@
+{
+ "abstracts": [],
+ "container_id": "qea6koy5crhvfgkjwgnwojjkrm",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1017/s000497270001981x"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "S000497270001981X"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "iitldffmnncijgnf6ujb6zmdfu",
+ "issue": "03",
+ "language": "en",
+ "pages": "f1",
+ "publisher": "Cambridge University Press (CUP)",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2001,
+ "revision": "39087ec0-0c3f-4353-9656-961c5f11be42",
+ "state": "active",
+ "title": "BAZ volume 64 issue 3 Cover and Front matter",
+ "volume": "64",
+ "work_id": "7bjigtuhhvh3nk3rvckd6w7emy"
+}
diff --git a/tests/data/release/ppnzru2opnhxlai7pcmo7phe4i b/tests/data/release/ppnzru2opnhxlai7pcmo7phe4i
new file mode 100644
index 0000000..9f68d90
--- /dev/null
+++ b/tests/data/release/ppnzru2opnhxlai7pcmo7phe4i
@@ -0,0 +1,30 @@
+{
+ "abstracts": [],
+ "container_id": "qea6koy5crhvfgkjwgnwojjkrm",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1017/s0004972700005785"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "S0004972700005785"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "ppnzru2opnhxlai7pcmo7phe4i",
+ "issue": "03",
+ "language": "en",
+ "pages": "f1",
+ "publisher": "Cambridge University Press (CUP)",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1982,
+ "revision": "e6f9dedb-b6c0-4bf4-b3f4-d3427aa30c62",
+ "state": "active",
+ "title": "BAZ volume 26 issue 3 Cover and Front matter",
+ "volume": "26",
+ "work_id": "4wwtp45b3nchpo2bs7pifwsb44"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 46df18e..732d401 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -83,3 +83,4 @@ jdtngtiz3bdqboypujoni2x3ry,byh7xr5qhjca3bw53ivdotck3e,Status.EXACT,
5lk635o65nc2tnkus3pkf2ggeq,hqrvhbvocvaabg6nr5p43tl3uq,TODO,
5lk635o65nc2tnkus3pkf2ggeq,zfwf3tefajc6zdxa47vgilm7wm,TODO,
hqrvhbvocvaabg6nr5p43tl3uq,zfwf3tefajc6zdxa47vgilm7wm,TODO,
+ppnzru2opnhxlai7pcmo7phe4i,iitldffmnncijgnf6ujb6zmdfu,Status.DIFFERENT,Miss.NUM_DIFF