summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author Martin Czygan <martin@archive.org> 2021-06-10 14:28:05 +0000
committer Martin Czygan <martin@archive.org> 2021-06-10 14:28:05 +0000
commit b67cac61d815634969e91999d4bc2954b99bf2b2 (patch)
tree 273a60ae8c061137139f12fe63fa43a52f8a27a6 /python/fatcat_tools
parent 9779781e6cb2b80d646b6d9e91c190d1faad503f (diff)
parent e6490032e7c3c5bea97c66701aa28abd12b94973 (diff)
download fatcat-b67cac61d815634969e91999d4bc2954b99bf2b2.tar.gz
fatcat-b67cac61d815634969e91999d4bc2954b99bf2b2.zip
Merge branch 'bnewbold-clean-doi-lower' into 'master'
clean_doi() should lower-case returned DOI. See merge request webgroup/fatcat!107
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/normal.py 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 4218856c..342edeef 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -22,13 +22,15 @@ def clean_doi(raw):
- 'doi:' prefix
- URL prefix
+ Lower-cases the DOI.
+
Does not try to un-URL-encode
Returns None if not a valid DOI
"""
if not raw:
return None
- raw = raw.strip()
+ raw = raw.strip().lower()
if '\u2013' in raw:
# Do not attempt to normalize "en dash" and since FC does not allow
# unicode in DOI, treat this as invalid.
@@ -84,6 +86,7 @@ def test_clean_doi():
assert clean_doi("10.4025/diálogos.v17i2.36030") == None
assert clean_doi("10.19027/jai.10.106‒115") == None
assert clean_doi("10.15673/атбп2312-3125.17/2014.26332") == None
+ assert clean_doi("10.7326/M20-6817") == "10.7326/m20-6817"
ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")