From 5afde4690a4653db53fe4962af5da3eb9188d9a2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 4 Nov 2020 21:57:16 -0800
Subject: normalizer: reject DOIs containing the non-ASCII LEFT-TO-RIGHT MARK (U+200E)

---
 python/fatcat_tools/normal.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index e65af8d6..10a90dba 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -55,7 +55,7 @@ def clean_doi(raw):
     # will likely want to expand DOI_REGEX to exclude non-ASCII characters, but
     # for now block specific characters so we can get PubMed importer running
     # again.
-    if 'ä' in raw:
+    if 'ä' in raw or '\u200e' in raw:
         return None
     return raw
 
@@ -71,6 +71,8 @@ def test_clean_doi():
     assert clean_doi("doi:10.1234/ asdf ") == None
     assert clean_doi("10.4149/gpb¬_2017042") == None  # "logical negation" character
     assert clean_doi("10.6002/ect.2020.häyry") == None  # this example via pubmed (pmid:32519616)
+    assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") == None
+
 
 ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
 
-- 
cgit v1.2.3