summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/identifiers.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:36:27 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:36:27 -0700
commit: 3b4d87826eea9b827fdd8b824e6cd738d89db5bd (patch)
tree: 985f29c7c21bb732209ca66fa58123125248940f /fatcat_scholar/identifiers.py
parent: 5a1d53e8705d5ea59ea0c007f8a53940a353000b (diff)
download: fatcat-scholar-3b4d87826eea9b827fdd8b824e6cd738d89db5bd.tar.gz
fatcat-scholar-3b4d87826eea9b827fdd8b824e6cd738d89db5bd.zip
refs: clean up GROBID DOIs and PMCIDs
Diffstat (limited to 'fatcat_scholar/identifiers.py')
-rw-r--r-- fatcat_scholar/identifiers.py 16
1 files changed, 6 insertions, 10 deletions
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py
index 583c8e6..7572e20 100644
--- a/fatcat_scholar/identifiers.py
+++ b/fatcat_scholar/identifiers.py
@@ -24,16 +24,10 @@ def clean_doi(raw: Optional[str]) -> Optional[str]:
return None
if len(raw.split()) != 1:
return None
- if raw.startswith("doi:"):
- raw = raw[4:]
- if raw.startswith("http://"):
- raw = raw[7:]
- if raw.startswith("https://"):
- raw = raw[8:]
- if raw.startswith("doi.org/"):
- raw = raw[8:]
- if raw.startswith("dx.doi.org/"):
- raw = raw[11:]
+ if not "10." in raw:
+ return None
+ if not raw.startswith("10."):
+ raw = raw[raw.find("10."):]
if raw[7:9] == "//":
raw = raw[:8] + raw[9:]
@@ -75,6 +69,8 @@ def test_clean_doi() -> None:
assert (
clean_doi("10.6002/ect.2020.häyry") == None
) # this example via pubmed (pmid:32519616)
+ # GROBID mangled DOI
+ assert clean_doi("21924DOI10.1234/asdf ") == "10.1234/asdf"
def clean_pmcid(raw: Optional[str]) -> Optional[str]: