about | summary | refs | log | tree | commit | diff | stats
path: root/fuzzycat
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-11-25 17:47:29 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-25 17:47:29 +0100
commit: 375fa3f9ebca09de596f89ae6b137d905dbb5ba7 (patch)
tree: ff95d2c4e71df061830a5e70cbf7c944cc8ec68c /fuzzycat
parent: f67e14fdb7ab6cad06b36a532e51eb309001a66f (diff)
download: fuzzycat-375fa3f9ebca09de596f89ae6b137d905dbb5ba7.tar.gz
fuzzycat-375fa3f9ebca09de596f89ae6b137d905dbb5ba7.zip
add another test case
Diffstat (limited to 'fuzzycat')
-rw-r--r-- fuzzycat/utils.py 4
-rw-r--r-- fuzzycat/verify.py 15
2 files changed, 10 insertions, 9 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 4d1325d..1cac668 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,14 +1,13 @@
import io
import itertools
-import re
import string
+import re
printable_no_punct = string.digits + string.ascii_letters + string.whitespace
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
-
def slugify_string(s: str) -> str:
"""
Keeps ascii chars and single whitespace only.
@@ -90,3 +89,4 @@ def contains_chemical_formula(s):
for token in s.split():
if CHEM_FORMULA.search(token):
return True
+
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index ab26603..2bb4adb 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -328,6 +328,14 @@ TITLE_FRAGMENT_BLACKLIST = set([
"untersuchung einzelner abdominaler regionen und organe",
])
+CONTAINER_NAME_BLACKLIST = set([
+ "crossref listing of deleted dois",
+])
+
+PUBLISHER_BLACKLIST = set([
+ "test accounts",
+])
+
# These titles appear too often, so ignore them for now.
TITLE_BLACKLIST = set([
"",
@@ -3526,10 +3534,3 @@ TITLE_BLACKLIST = set([
"週刊ダイヤモンド = diamond weekly 別冊",
])
-CONTAINER_NAME_BLACKLIST = set([
- "crossref listing of deleted dois",
-])
-
-PUBLISHER_BLACKLIST = set([
- "test accounts",
-])