From e6490032e7c3c5bea97c66701aa28abd12b94973 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Mon, 7 Jun 2021 14:56:56 -0700
Subject: clean_doi() should lower-case returned DOI

Code in a number of places (including the Pubmed importer) assumed that
clean_doi() was already lower-casing DOIs, resulting in some broken
metadata getting created.

See also: https://github.com/internetarchive/fatcat/issues/83

This is just the first step of mitigation.
---
 python/fatcat_tools/normal.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 4218856c..342edeef 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -22,13 +22,15 @@ def clean_doi(raw):
     - 'doi:' prefix
     - URL prefix
 
+    Lower-cases the DOI.
+
     Does not try to un-URL-encode
 
     Returns None if not a valid DOI
     """
     if not raw:
         return None
-    raw = raw.strip()
+    raw = raw.strip().lower()
     if '\u2013' in raw:
         # Do not attempt to normalize "en dash" and since FC does not allow
         # unicode in DOI, treat this as invalid.
@@ -84,6 +86,7 @@ def test_clean_doi():
     assert clean_doi("10.4025/diálogos.v17i2.36030") == None
     assert clean_doi("10.19027/jai.10.106‒115") == None
     assert clean_doi("10.15673/атбп2312-3125.17/2014.26332") == None
+    assert clean_doi("10.7326/M20-6817") == "10.7326/m20-6817"
 
 
 ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
-- 
cgit v1.2.3