aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-11-19 17:09:54 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-11-19 17:09:56 -0800
commit 6844071983fb1c9cfc0717247e933e23dead51ca (patch)
tree d3b9d6d08ce219ab62e6f6c54b1613de6d2bb871 /python/fatcat_tools
parent e935725539c1b66d8e8cb917d37aed959921418e (diff)
download fatcat-6844071983fb1c9cfc0717247e933e23dead51ca.tar.gz
fatcat-6844071983fb1c9cfc0717247e933e23dead51ca.zip
clean DOI: ban all non-ASCII characters
I believe this is safe and matches the regex filter in Rust (fatcatd). We keep hitting one-off DOIs that were passing through the Python check, so we are being more strict from here forward.
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/normal.py 5
1 file changed, 4 insertions, 1 deletion
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index dea6da59..aae3ac68 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -62,7 +62,8 @@ def clean_doi(raw):
# will likely want to expand DOI_REGEX to exclude non-ASCII characters, but
# for now block specific characters so we can get PubMed importer running
# again.
- if 'ä' in raw or 'á' in raw or '\u200e' in raw or '\u2043' in raw:
+ # known characters to skip: ä á \u200e \u2043 \u2012
+ if not raw.isascii():
return None
return raw
@@ -81,6 +82,8 @@ def test_clean_doi():
assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") == None
assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") == None
assert clean_doi("10.4025/diálogos.v17i2.36030") == None
+ assert clean_doi("10.19027/jai.10.106‒115") == None
+ assert clean_doi("10.15673/атбп2312-3125.17/2014.26332") == None
ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")