diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-06-07 14:56:56 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-06-07 14:56:59 -0700 |
commit | e6490032e7c3c5bea97c66701aa28abd12b94973 (patch) | |
tree | 273a60ae8c061137139f12fe63fa43a52f8a27a6 | |
parent | 9779781e6cb2b80d646b6d9e91c190d1faad503f (diff) | |
download | fatcat-e6490032e7c3c5bea97c66701aa28abd12b94973.tar.gz fatcat-e6490032e7c3c5bea97c66701aa28abd12b94973.zip |
clean_doi() should lower-case returned DOI
Code in a number of places (including the PubMed importer) assumed that
clean_doi() was already lower-casing DOIs, resulting in some broken metadata
being created.
See also: https://github.com/internetarchive/fatcat/issues/83
This is just the first step of mitigation.
-rw-r--r-- | python/fatcat_tools/normal.py | 5 |
1 files changed, 4 insertions, 1 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 4218856c..342edeef 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -22,13 +22,15 @@ def clean_doi(raw): - 'doi:' prefix - URL prefix + Lower-cases the DOI. + Does not try to un-URL-encode Returns None if not a valid DOI """ if not raw: return None - raw = raw.strip() + raw = raw.strip().lower() if '\u2013' in raw: # Do not attempt to normalize "en dash" and since FC does not allow # unicode in DOI, treat this as invalid. @@ -84,6 +86,7 @@ def test_clean_doi(): assert clean_doi("10.4025/diálogos.v17i2.36030") == None assert clean_doi("10.19027/jai.10.106‒115") == None assert clean_doi("10.15673/атбп2312-3125.17/2014.26332") == None + assert clean_doi("10.7326/M20-6817") == "10.7326/m20-6817" ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$") |