summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/normal.py95
1 files changed, 95 insertions, 0 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
new file mode 100644
index 00000000..044ab87d
--- /dev/null
+++ b/python/fatcat_tools/normal.py
@@ -0,0 +1,95 @@
+
+"""
+A bunch of helpers to parse and normalize strings: external identifiers,
+free-form input, titles, etc.
+"""
+
+import re
+
+
+def clean_doi(raw):
+ """
+ Removes any:
+ - padding whitespace
+ - 'doi:' prefix
+ - URL prefix
+
+ Does not try to un-URL-encode
+
+ Returns None if not a valid DOI
+ """
+ raw = raw.strip()
+ if len(raw.split()) != 1:
+ return None
+ if raw.startswith("doi:"):
+ raw = raw[4:]
+ if raw.startswith("http://"):
+ raw = raw[7:]
+ if raw.startswith("https://"):
+ raw = raw[8:]
+ if raw.startswith("doi.org/"):
+ raw = raw[8:]
+ if raw.startswith("dx.doi.org/"):
+ raw = raw[11:]
+ if not raw.startswith("10."):
+ return None
+ # TODO: actual regex
+ return raw
+
+def test_clean_doi():
+ assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
+ assert clean_doi("doi:10.1234/ asdf ") == None
+
+def clean_arxiv_id(raw):
+ """
+ Removes any:
+ - 'arxiv:' prefix
+
+ Works with versioned or un-versioned arxiv identifiers.
+ """
+ pass
+
+def test_clean_arxiv_id():
+ pass
+
+def clean_pmcid(raw):
+ raw = raw.strip()
+ if len(raw.split()) != 1:
+ return None
+ if raw.startswith("PMC") and raw[3:] and raw[3:].isdigit():
+ return raw
+ return None
+
+def clean_sha1(raw):
+ raw = raw.strip()
+ if len(raw.split()) != 1:
+ return None
+ pass
+
+def clean_issn(raw):
+ raw = raw.strip()
+ if len(raw.split()) != 1:
+ return None
+ if len(raw) == 9 and raw[4] == "-" and raw[0:4].isdigit():
+ return raw
+ return None
+
+def test_clean_issn():
+ assert clean_issn("1234-4567") == "1234-4567"
+ assert clean_issn("134-4567") == None
+ assert clean_issn("123X-4567") == None
+
+def clean_isbn13(raw):
+ raw = raw.strip()
+ if len(raw.split()) != 1:
+ return None
+ return None
+
+def clean_orcid(raw):
+ raw = raw.strip()
+ if len(raw.split()) != 1:
+ return None
+ return None