start work on 'generic' search box

author: Bryan Newbold <bnewbold@robocracy.org> 2019-06-11 10:43:54 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-06-13 14:36:59 -0700
commit: 55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2 (patch)
tree: 57f12af517a33c41f37de016d555add00bf56c58 /python/fatcat_tools/normal.py
parent: 016315f69a03473625d4d8ea3c450eb814e26911 (diff)
download: fatcat-55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2.tar.gz
fatcat-55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2.zip
1 files changed, 95 insertions, 0 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
new file mode 100644
index 00000000..044ab87d
--- /dev/null
+++ b/python/fatcat_tools/normal.py
@@ -0,0 +1,95 @@
+
+"""
+A bunch of helpers to parse and normalize strings: external identifiers,
+free-form input, titles, etc.
+"""
+
+import re
+
+
+def clean_doi(raw):
+    """
+    Normalizes a free-form DOI string.
+
+    Removes any:
+    - padding whitespace
+    - 'doi:' prefix
+    - URL prefix ('http://', 'https://', 'doi.org/', 'dx.doi.org/')
+
+    Prefixes are matched case-insensitively, so 'DOI:10.1234/asdf' and
+    'HTTPS://DOI.ORG/10.1234/asdf' are handled too. The DOI itself is
+    returned with its original case.
+
+    Does not try to un-URL-encode
+
+    Returns None if not a valid DOI: the input must be a single
+    whitespace-free token of the form '10.<registrant>/<suffix>' with a
+    non-empty registrant and a non-empty suffix.
+    """
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    # Prefixes are tried once each, in this fixed order, so stacked forms
+    # such as 'doi:https://dx.doi.org/...' are fully unwrapped.
+    for prefix in ("doi:", "http://", "https://", "doi.org/", "dx.doi.org/"):
+        # Lower-case only the slice being compared; the rest keeps its case.
+        if raw[:len(prefix)].lower() == prefix:
+            raw = raw[len(prefix):]
+    if not raw.startswith("10."):
+        return None
+    # TODO: replace this structural check with an actual DOI regex
+    registrant, slash, suffix = raw[3:].partition("/")
+    if not (registrant and slash and suffix):
+        return None
+    return raw
+
+def test_clean_doi():
+    """
+    Checks that clean_doi() strips padding whitespace and the 'doi:' / URL
+    prefixes, and rejects input containing embedded whitespace.
+    """
+    assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
+    # None is a singleton: compare by identity, not equality
+    assert clean_doi("doi:10.1234/ asdf ") is None
+
+def clean_arxiv_id(raw):
+    """
+    Removes any:
+    - padding whitespace
+    - 'arxiv:' prefix (matched case-insensitively, eg 'arXiv:')
+
+    Works with versioned or un-versioned arxiv identifiers, in both the
+    current scheme ('1501.00001', '0704.0001v2') and the older
+    archive-based scheme ('hep-th/9901001', 'math.GT/0309136v1').
+
+    Returns None if not a valid arxiv identifier
+    """
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    if raw[:6].lower() == "arxiv:":
+        raw = raw[6:]
+    # current scheme: YYMM.NNNN or YYMM.NNNNN
+    new_style = r"[0-9]{4}\.[0-9]{4,5}"
+    # older scheme: archive, optional two-letter subject class, YYMMNNN
+    old_style = r"[a-z\-]+(\.[A-Z]{2})?/[0-9]{7}"
+    # either scheme may carry a 'vN' version suffix
+    if re.fullmatch(r"(%s|%s)(v[0-9]+)?" % (new_style, old_style), raw):
+        return raw
+    return None
+
+def test_clean_arxiv_id():
+    """
+    Placeholder test: makes no assertions yet.
+    """
+
+def clean_pmcid(raw):
+    """
+    Removes any:
+    - padding whitespace
+
+    Returns None unless the remaining string is the literal 'PMC' followed
+    by one or more ASCII digits (the prefix is case-sensitive).
+    """
+    raw = raw.strip()
+    # str.isdigit() also accepts non-ASCII digits (eg superscripts), so match
+    # explicitly against [0-9]; a full match also rules out embedded
+    # whitespace and the empty string.
+    if re.fullmatch(r"PMC[0-9]+", raw):
+        return raw
+    return None
+
+def clean_sha1(raw):
+    """
+    Placeholder: not implemented yet, so every input yields None.
+
+    Only the whitespace handling shared with the other helpers is in place.
+    """
+    token_count = len(raw.strip().split())
+    if token_count == 1:
+        # TODO: validate the single remaining token
+        return None
+    return None
+
+def clean_issn(raw):
+    """
+    Removes any:
+    - padding whitespace
+
+    Upper-cases a lower-case 'x' check character.
+
+    Returns None if not in 'NNNN-NNNC' form, where N is an ASCII digit and C
+    is an ASCII digit or 'X'. The check character is not verified against
+    the other digits.
+    """
+    raw = raw.strip().upper()
+    # Validate all eight characters, not just the first group; [0-9] keeps
+    # out the non-ASCII digits that str.isdigit() would let through.
+    if re.fullmatch(r"[0-9]{4}-[0-9]{3}[0-9X]", raw):
+        return raw
+    return None
+
+def test_clean_issn():
+    """
+    Checks that clean_issn() passes a well-formed ISSN through and rejects
+    a short first group and a non-digit in the first group.
+    """
+    assert clean_issn("1234-4567") == "1234-4567"
+    # None is a singleton: compare by identity, not equality
+    assert clean_issn("134-4567") is None
+    assert clean_issn("123X-4567") is None
+
+def clean_isbn13(raw):
+    """
+    Placeholder: not implemented yet, so every input yields None.
+
+    Only the whitespace handling shared with the other helpers is in place.
+    """
+    stripped = raw.strip()
+    is_single_token = len(stripped.split()) == 1
+    if not is_single_token:
+        return None
+    # TODO: validation of the remaining token goes here
+    return None
+
+def clean_orcid(raw):
+    """
+    Placeholder: not implemented yet, so every input yields None.
+
+    Only the whitespace handling shared with the other helpers is in place.
+    """
+    pieces = raw.strip().split()
+    if len(pieces) == 1:
+        # TODO: validate the single remaining token
+        pass
+    return None
author	Bryan Newbold <bnewbold@robocracy.org>	2019-06-11 10:43:54 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-06-13 14:36:59 -0700
commit	55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2 (patch)
tree	57f12af517a33c41f37de016d555add00bf56c58 /python/fatcat_tools/normal.py
parent	016315f69a03473625d4d8ea3c450eb814e26911 (diff)
download	fatcat-55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2.tar.gz fatcat-55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2.zip