summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-11-17 18:38:06 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-11-19 14:55:15 -0800
commit: c9bfb0be4c7e38b6668f49588f2ffecee7b17912 (patch)
tree: 930c516b76e45705d9f3184954a27316625138a6
parent: 90b336ec3fe2cf34b0cbbbf5717aa3883af8685e (diff)
download: fatcat-c9bfb0be4c7e38b6668f49588f2ffecee7b17912.tar.gz
fatcat-c9bfb0be4c7e38b6668f49588f2ffecee7b17912.zip
handle more non-ASCII DOI cases
-rw-r--r-- python/fatcat_tools/normal.py | 4
1 file changed, 3 insertions, 1 deletion
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 39927651..ed439225 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -62,7 +62,7 @@ def clean_doi(raw):
# will likely want to expand DOI_REGEX to exclude non-ASCII characters, but
# for now block specific characters so we can get PubMed importer running
# again.
- if 'ä' in raw or '\u200e' in raw:
+ if 'ä' in raw or 'á' in raw or '\u200e' in raw or '\u2043' in raw:
return None
return raw
@@ -79,6 +79,8 @@ def test_clean_doi():
assert clean_doi("10.4149/gpb¬_2017042") == None # "logical negation" character
assert clean_doi("10.6002/ect.2020.häyry") == None # this example via pubmed (pmid:32519616)
assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") == None
+ assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") == None
+ assert clean_doi("10.4025/diálogos.v17i2.36030") == None
ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")