summaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/identifiers.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/identifiers.py')
-rw-r--r--fatcat_scholar/identifiers.py90
1 files changed, 90 insertions, 0 deletions
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py
new file mode 100644
index 0000000..2ea09c7
--- /dev/null
+++ b/fatcat_scholar/identifiers.py
@@ -0,0 +1,90 @@
+import re
+from typing import Optional
+
+DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
+
+
+def clean_doi(raw: Optional[str]) -> Optional[str]:
+ """
+ Removes any:
+ - padding whitespace
+ - 'doi:' prefix
+ - URL prefix
+
+ Does not try to un-URL-encode
+
+ Returns None if not a valid DOI
+ """
+ if not raw:
+ return None
+ raw = raw.strip()
+ if "\u2013" in raw:
+ # Do not attempt to normalize "en dash" and since FC does not allow
+ # unicode in DOI, treat this as invalid.
+ return None
+ if len(raw.split()) != 1:
+ return None
+ if raw.startswith("doi:"):
+ raw = raw[4:]
+ if raw.startswith("http://"):
+ raw = raw[7:]
+ if raw.startswith("https://"):
+ raw = raw[8:]
+ if raw.startswith("doi.org/"):
+ raw = raw[8:]
+ if raw.startswith("dx.doi.org/"):
+ raw = raw[11:]
+ if raw[7:9] == "//":
+ raw = raw[:8] + raw[9:]
+
+ # fatcatd uses same REGEX, but Rust regex rejects these characters, while
+ # python doesn't. DOIs are syntaxtually valid, but very likely to be typos;
+ # for now filter them out.
+ for c in ("¬",):
+ if c in raw:
+ return None
+
+ if not raw.startswith("10."):
+ return None
+ if not DOI_REGEX.fullmatch(raw):
+ return None
+ # will likely want to expand DOI_REGEX to exclude non-ASCII characters, but
+ # for now block specific characters so we can get PubMed importer running
+ # again.
+ if "ä" in raw:
+ return None
+ return raw
+
+
+def test_clean_doi() -> None:
+ assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+ assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+ assert clean_doi("10.23750/abm.v88i2 -s.6506") == None
+ assert clean_doi("10.17167/mksz.2017.2.129–155") == None
+ assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("doi:10.1234/ asdf ") == None
+ assert clean_doi("10.4149/gpb¬_2017042") == None # "logical negation" character
+ assert (
+ clean_doi("10.6002/ect.2020.häyry") == None
+ ) # this example via pubmed (pmid:32519616)
+
+
+def clean_pmcid(raw: Optional[str]) -> Optional[str]:
+ if not raw:
+ return None
+ raw = raw.strip()
+ if len(raw.split()) != 1:
+ return None
+ if raw.startswith("PMC") and raw[3:] and raw[3:].isdigit():
+ return raw
+ return None
+
+
+def test_clean_pmcid() -> None:
+ assert clean_pmcid("10.1234/asdf ") == None
+ assert clean_pmcid("") == None
+ assert clean_pmcid(None) == None
+ assert clean_pmcid("PMC123") == "PMC123"