diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-01 16:36:27 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-01 16:36:27 -0700 |
commit | 3b4d87826eea9b827fdd8b824e6cd738d89db5bd (patch) | |
tree | 985f29c7c21bb732209ca66fa58123125248940f /fatcat_scholar/identifiers.py | |
parent | 5a1d53e8705d5ea59ea0c007f8a53940a353000b (diff) | |
download | fatcat-scholar-3b4d87826eea9b827fdd8b824e6cd738d89db5bd.tar.gz fatcat-scholar-3b4d87826eea9b827fdd8b824e6cd738d89db5bd.zip |
refs: clean up GROBID DOIs and PMCIDs
Diffstat (limited to 'fatcat_scholar/identifiers.py')
-rw-r--r-- | fatcat_scholar/identifiers.py | 16 |
1 file changed, 6 insertions, 10 deletions
```diff
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py
index 583c8e6..7572e20 100644
--- a/fatcat_scholar/identifiers.py
+++ b/fatcat_scholar/identifiers.py
@@ -24,16 +24,10 @@ def clean_doi(raw: Optional[str]) -> Optional[str]:
         return None
     if len(raw.split()) != 1:
         return None
-    if raw.startswith("doi:"):
-        raw = raw[4:]
-    if raw.startswith("http://"):
-        raw = raw[7:]
-    if raw.startswith("https://"):
-        raw = raw[8:]
-    if raw.startswith("doi.org/"):
-        raw = raw[8:]
-    if raw.startswith("dx.doi.org/"):
-        raw = raw[11:]
+    if not "10." in raw:
+        return None
+    if not raw.startswith("10."):
+        raw = raw[raw.find("10."):]
     if raw[7:9] == "//":
         raw = raw[:8] + raw[9:]

@@ -75,6 +69,8 @@ def test_clean_doi() -> None:
     assert (
         clean_doi("10.6002/ect.2020.häyry") == None
     ) # this example via pubmed (pmid:32519616)
+    # GROBID mangled DOI
+    assert clean_doi("21924DOI10.1234/asdf ") == "10.1234/asdf"


 def clean_pmcid(raw: Optional[str]) -> Optional[str]:
```