author     Bryan Newbold <bnewbold@robocracy.org>    2019-06-11 10:43:54 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>    2019-06-13 14:36:59 -0700
commit     55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2 (patch)
tree       57f12af517a33c41f37de016d555add00bf56c58 /python/fatcat_tools/normal.py
parent     016315f69a03473625d4d8ea3c450eb814e26911 (diff)
download   fatcat-55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2.tar.gz
           fatcat-55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2.zip
start work on 'generic' search box
Diffstat (limited to 'python/fatcat_tools/normal.py')
-rw-r--r--    python/fatcat_tools/normal.py    95
1 file changed, 95 insertions, 0 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
new file mode 100644
index 00000000..044ab87d
--- /dev/null
+++ b/python/fatcat_tools/normal.py
@@ -0,0 +1,95 @@
+
+"""
+A bunch of helpers to parse and normalize strings: external identifiers,
+free-form input, titles, etc.
+"""
+
+import re
+
+
+def clean_doi(raw):
+    """
+    Removes any:
+    - padding whitespace
+    - 'doi:' prefix
+    - URL prefix
+
+    Does not try to un-URL-encode
+
+    Returns None if not a valid DOI
+    """
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    if raw.startswith("doi:"):
+        raw = raw[4:]
+    if raw.startswith("http://"):
+        raw = raw[7:]
+    if raw.startswith("https://"):
+        raw = raw[8:]
+    if raw.startswith("doi.org/"):
+        raw = raw[8:]
+    if raw.startswith("dx.doi.org/"):
+        raw = raw[11:]
+    if not raw.startswith("10."):
+        return None
+    # TODO: actual regex
+    return raw
+
+def test_clean_doi():
+    assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("doi:10.1234/ asdf ") is None
+
+def clean_arxiv_id(raw):
+    """
+    Removes any:
+    - 'arxiv:' prefix
+
+    Works with versioned or un-versioned arxiv identifiers.
+    """
+    pass
+
+def test_clean_arxiv_id():
+    pass
+
+def clean_pmcid(raw):
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    if raw.startswith("PMC") and raw[3:] and raw[3:].isdigit():
+        return raw
+    return None
+
+def clean_sha1(raw):
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    pass
+
+def clean_issn(raw):
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    if len(raw) == 9 and raw[4] == "-" and raw[0:4].isdigit() and raw[5:8].isdigit() and raw[8] in "0123456789X":
+        return raw
+    return None
+
+def test_clean_issn():
+    assert clean_issn("1234-4567") == "1234-4567"
+    assert clean_issn("134-4567") is None
+    assert clean_issn("123X-4567") is None
+
+def clean_isbn13(raw):
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    return None
+
+def clean_orcid(raw):
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    return None
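
The `# TODO: actual regex` in clean_doi() above is left in place. As a rough illustration only (not part of this commit), a permissive syntax check for the "10.<registrant>/<suffix>" shape most DOIs follow might look like the sketch below; the DOI_REGEX name and the exact pattern are assumptions.

import re

# Hypothetical pattern, not from the commit: "10." followed by a numeric
# registrant code, then "/" and a non-whitespace suffix.
DOI_REGEX = re.compile(r"^10\.\d{3,6}(\.\d+)*/\S+$")

def looks_like_doi(raw):
    return bool(DOI_REGEX.match(raw))

assert looks_like_doi("10.1234/asdf")
assert not looks_like_doi("10.1234")
assert not looks_like_doi("10.1234/")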
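
clean_arxiv_id() and its test are still stubs in this commit. Below is a minimal sketch of the behavior the docstring describes, assuming both new-style ("1501.00001v2") and old-style ("hep-th/9901001") identifier formats; the regex patterns and the _sketch name are illustrative, not the project's eventual implementation.

import re

# Hypothetical patterns, not from the commit: new-style IDs like
# "1501.00001" / "1501.00001v2" and old-style IDs like "hep-th/9901001"
# or "math.GT/0309136v1".
ARXIV_NEW = re.compile(r"^\d{4}\.\d{4,5}(v\d+)?$")
ARXIV_OLD = re.compile(r"^[a-z-]+(\.[A-Z]{2})?/\d{7}(v\d+)?$")

def clean_arxiv_id_sketch(raw):
    raw = raw.strip()
    if raw.lower().startswith("arxiv:"):
        raw = raw[6:]
    if ARXIV_NEW.match(raw) or ARXIV_OLD.match(raw):
        return raw
    return None

assert clean_arxiv_id_sketch("arxiv:1501.00001v2") == "1501.00001v2"
assert clean_arxiv_id_sketch("hep-th/9901001") == "hep-th/9901001"
assert clean_arxiv_id_sketch("1501.00001 v2") is None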
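
clean_sha1(), clean_isbn13(), and clean_orcid() likewise return None for every input at this point. Hedged sketches of plausible checks follow, assuming lowercase 40-character hex SHA-1 digests, 978/979-prefixed ISBN-13s, and the "0000-0000-0000-000X" ORCID shape; the names and patterns are assumptions, not the commit's code.

import re

# Hypothetical patterns, not from the commit.
SHA1_REGEX = re.compile(r"^[0-9a-f]{40}$")
ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")

def clean_sha1_sketch(raw):
    # Normalize to lowercase hex and require exactly 40 hex characters.
    raw = raw.strip().lower()
    return raw if SHA1_REGEX.match(raw) else None

def clean_isbn13_sketch(raw):
    # Drop hyphens, then require 13 digits starting with 978 or 979.
    digits = raw.strip().replace("-", "")
    if len(digits) == 13 and digits.isdigit() and digits[:3] in ("978", "979"):
        return digits
    return None

def clean_orcid_sketch(raw):
    raw = raw.strip()
    return raw if ORCID_REGEX.match(raw) else None

assert clean_orcid_sketch("0000-0002-1825-0097") == "0000-0002-1825-0097"
assert clean_isbn13_sketch("978-3-16-148410-0") == "9783161484100"
assert clean_sha1_sketch("not a sha1") is None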