diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-19 09:52:02 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-19 09:58:18 -0800 |
commit | a73f751d9eea2b4eaca780e940f324c9c07119a2 (patch) | |
tree | 821487623c64ee75b554dd9b317a52d6e2aeaaf9 /python | |
parent | b9842a63ccfea954b477c0abfbe168a236d15f23 (diff) | |
download | fatcat-a73f751d9eea2b4eaca780e940f324c9c07119a2.tar.gz fatcat-a73f751d9eea2b4eaca780e940f324c9c07119a2.zip |
normal: DOI corner-case from pubmed import
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/normal.py | 9 |
1 files changed, 9 insertions, 0 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 7b4bd19c..7a2b5fd9 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -40,6 +40,14 @@ def clean_doi(raw): raw = raw[11:] if raw[7:9] == "//": raw = raw[:8] + raw[9:] + + # fatcatd uses same REGEX, but Rust regex rejects these characters, while + # python doesn't. DOIs are syntactically valid, but very likely to be typos; + # for now filter them out. + for c in ('¬', ): + if c in raw: + return None + if not raw.startswith("10."): return None if not DOI_REGEX.fullmatch(raw): @@ -56,6 +64,7 @@ def test_clean_doi(): assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf" assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf" assert clean_doi("doi:10.1234/ asdf ") == None + assert clean_doi("10.4149/gpb¬_2017042") == None # "logical negation" character ARXIV_ID_REGEX = re.compile("^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$") |