diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-11-04 21:57:16 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-11-04 21:57:16 -0800 |
commit | 5afde4690a4653db53fe4962af5da3eb9188d9a2 (patch) | |
tree | e88fa194620e4c70ff65402d570c8d1ff90fc189 /python/fatcat_tools | |
parent | bf77adc854022213951daec14bd904f483f21202 (diff) | |
download | fatcat-5afde4690a4653db53fe4962af5da3eb9188d9a2.tar.gz fatcat-5afde4690a4653db53fe4962af5da3eb9188d9a2.zip |
normalizer: filter out a specific non-ASCII character in DOI
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/normal.py | 4 |
1 files changed, 3 insertions, 1 deletions
```diff
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index e65af8d6..10a90dba 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -55,7 +55,7 @@ def clean_doi(raw):
     # will likely want to expand DOI_REGEX to exclude non-ASCII characters, but
     # for now block specific characters so we can get PubMed importer running
     # again.
-    if 'ä' in raw:
+    if 'ä' in raw or '\u200e' in raw:
         return None
     return raw
 
@@ -71,6 +71,8 @@ def test_clean_doi():
     assert clean_doi("doi:10.1234/ asdf ") == None
     assert clean_doi("10.4149/gpb¬_2017042") == None # "logical negation" character
     assert clean_doi("10.6002/ect.2020.häyry") == None # this example via pubmed (pmid:32519616)
+    assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") == None
+
 
 ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
 
```