diff options
| -rw-r--r-- | python/fatcat_tools/normal.py | 4 | 
1 files changed, 3 insertions, 1 deletions
| diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 39927651..ed439225 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -62,7 +62,7 @@ def clean_doi(raw):      # will likely want to expand DOI_REGEX to exclude non-ASCII characters, but      # for now block specific characters so we can get PubMed importer running      # again. -    if 'ä' in raw or '\u200e' in raw: +    if 'ä' in raw or 'á' in raw or '\u200e' in raw or '\u2043' in raw:          return None      return raw @@ -79,6 +79,8 @@ def test_clean_doi():      assert clean_doi("10.4149/gpb¬_2017042") == None  # "logical negation" character      assert clean_doi("10.6002/ect.2020.häyry") == None  # this example via pubmed (pmid:32519616)      assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") == None +    assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") == None +    assert clean_doi("10.4025/diálogos.v17i2.36030") == None  ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$") | 
