summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-01-19 09:52:02 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-01-19 09:58:18 -0800
commit: a73f751d9eea2b4eaca780e940f324c9c07119a2 (patch)
tree: 821487623c64ee75b554dd9b317a52d6e2aeaaf9
parent: b9842a63ccfea954b477c0abfbe168a236d15f23 (diff)
download: fatcat-a73f751d9eea2b4eaca780e940f324c9c07119a2.tar.gz
          fatcat-a73f751d9eea2b4eaca780e940f324c9c07119a2.zip
normal: DOI corner-case from pubmed import
-rw-r--r-- python/fatcat_tools/normal.py 9
1 files changed, 9 insertions, 0 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 7b4bd19c..7a2b5fd9 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -40,6 +40,14 @@ def clean_doi(raw):
raw = raw[11:]
if raw[7:9] == "//":
raw = raw[:8] + raw[9:]
+
+ # fatcatd uses the same regex, but the Rust regex rejects these characters,
+ # while Python's doesn't. Such DOIs are syntactically valid, but very likely
+ # to be typos; for now, filter them out.
+ for c in ('¬', ):
+ if c in raw:
+ return None
+
if not raw.startswith("10."):
return None
if not DOI_REGEX.fullmatch(raw):
@@ -56,6 +64,7 @@ def test_clean_doi():
assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
assert clean_doi("doi:10.1234/ asdf ") == None
+ assert clean_doi("10.4149/gpb¬_2017042") == None # "logical negation" character
ARXIV_ID_REGEX = re.compile("^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")