From 9b5e1827331f1c609b3af9fef6067dc91bad53f2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 4 Feb 2021 18:09:25 -0800
Subject: make identifiers a bit more case-insensitive

This only works for a subset of query patterns.
---
 fatcat_scholar/identifiers.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fatcat_scholar')

diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py
index 34e9ebb..583c8e6 100644
--- a/fatcat_scholar/identifiers.py
+++ b/fatcat_scholar/identifiers.py
@@ -17,7 +17,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]:
     """
     if not raw:
         return None
-    raw = raw.strip()
+    raw = raw.strip().lower()
     if "\u2013" in raw:
         # Do not attempt to normalize "en dash" and since FC does not allow
         # unicode in DOI, treat this as invalid.
@@ -62,6 +62,7 @@ def test_clean_doi() -> None:
     assert clean_doi("asdf") == None
     assert clean_doi("10.123") == None
     assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("10.1234/ASdf ") == "10.1234/asdf"
     assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
     assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
     assert clean_doi("10.23750/abm.v88i2 -s.6506") == None
@@ -79,7 +80,7 @@ def test_clean_doi() -> None:
 def clean_pmcid(raw: Optional[str]) -> Optional[str]:
     if not raw:
         return None
-    raw = raw.strip()
+    raw = raw.strip().upper()
     if len(raw.split()) != 1:
         return None
     if raw.startswith("PMC") and raw[3:] and raw[3:].isdigit():
@@ -93,3 +94,4 @@ def test_clean_pmcid() -> None:
     assert clean_pmcid("1 2") == None
     assert clean_pmcid(None) == None
     assert clean_pmcid("PMC123") == "PMC123"
+    assert clean_pmcid("pmc123") == "PMC123"
-- 
cgit v1.2.3