diff options
Diffstat (limited to 'fatcat_scholar/identifiers.py')
-rw-r--r-- | fatcat_scholar/identifiers.py | 90 |
1 files changed, 90 insertions, 0 deletions
"""Helpers for cleaning and validating external identifiers (DOI, PMCID).

Module path per the originating diff: fatcat_scholar/identifiers.py
"""

import re
from typing import Optional

# NOTE: the "." after "10" is unescaped, so on its own this pattern matches any
# character in that position. clean_doi() checks for the literal "10." prefix
# separately before applying it. The pattern text is left unchanged because
# fatcatd is said to use the same regex.
DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")

# "PMC" followed by one or more ASCII digits. An explicit [0-9] class is used
# because str.isdigit() also accepts non-ASCII digits such as superscripts.
_PMCID_REGEX = re.compile(r"PMC[0-9]+")

# Prefixes removed from a raw DOI, tried in this order, each at most once.
_DOI_PREFIXES = ("doi:", "http://", "https://", "doi.org/", "dx.doi.org/")


def clean_doi(raw: Optional[str]) -> Optional[str]:
    """
    Normalize a raw DOI string.

    Removes any:
    - padding whitespace
    - 'doi:' prefix
    - URL prefix

    Also collapses a doubled slash directly after a 4-digit registrant prefix
    ("10.1037//x" becomes "10.1037/x").

    Does not try to un-URL-encode

    Returns None if not a valid DOI. This includes empty/None input, values
    with internal whitespace, and values containing any non-ASCII character.
    """
    if not raw:
        return None
    raw = raw.strip()

    # FC does not allow unicode in DOIs, and no attempt is made to normalize
    # look-alike characters (e.g. "en dash" to hyphen). Python's regex engine
    # would accept such characters via \S, so reject every non-ASCII value
    # up front rather than blocking individual characters one at a time.
    # (Equivalent to `not raw.isascii()` on Python 3.7+.)
    if any(ord(ch) > 127 for ch in raw):
        return None

    if len(raw.split()) != 1:
        return None

    for prefix in _DOI_PREFIXES:
        if raw.startswith(prefix):
            raw = raw[len(prefix):]

    # Drop the second slash of "10.NNNN//suffix". The fixed offsets mean this
    # only applies when the registrant prefix has exactly four digits.
    if raw[7:9] == "//":
        raw = raw[:8] + raw[9:]

    if not raw.startswith("10."):
        return None
    if not DOI_REGEX.fullmatch(raw):
        return None
    return raw


def test_clean_doi() -> None:
    assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
    assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
    assert clean_doi("10.23750/abm.v88i2 -s.6506") is None
    assert clean_doi("10.17167/mksz.2017.2.129–155") is None
    assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("doi:10.1234/ asdf ") is None
    assert clean_doi("10.4149/gpb¬_2017042") is None  # "logical negation" character
    assert (
        clean_doi("10.6002/ect.2020.häyry") is None
    )  # this example via pubmed (pmid:32519616)
    # any other non-ASCII character is rejected as well
    assert clean_doi("10.1234/asdf\u00e1") is None
    assert clean_doi("10.1234/as\u200edf") is None
    assert clean_doi(None) is None
    assert clean_doi("") is None


def clean_pmcid(raw: Optional[str]) -> Optional[str]:
    """
    Validate a PubMed Central identifier.

    Strips padding whitespace, then returns the value unchanged if it is
    "PMC" followed by one or more ASCII digits. Returns None for empty/None
    input, values with internal whitespace, or anything else.
    """
    if not raw:
        return None
    raw = raw.strip()
    if len(raw.split()) != 1:
        return None
    if _PMCID_REGEX.fullmatch(raw):
        return raw
    return None


def test_clean_pmcid() -> None:
    assert clean_pmcid("10.1234/asdf ") is None
    assert clean_pmcid("") is None
    assert clean_pmcid(None) is None
    assert clean_pmcid("PMC123") == "PMC123"
    assert clean_pmcid(" PMC123 ") == "PMC123"
    assert clean_pmcid("PMC") is None
    # non-ASCII "digits" must not pass validation
    assert clean_pmcid("PMC\u00b2") is None
    assert clean_pmcid("PMC\u0661\u0662\u0663") is None