From 55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 11 Jun 2019 10:43:54 -0700
Subject: start work on 'generic' search box

---
 python/fatcat_tools/normal.py | 95 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 python/fatcat_tools/normal.py

(limited to 'python/fatcat_tools/normal.py')

diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
new file mode 100644
index 00000000..044ab87d
--- /dev/null
+++ b/python/fatcat_tools/normal.py
@@ -0,0 +1,95 @@
+
+"""
+A bunch of helpers to parse and normalize strings: external identifiers,
+free-form input, titles, etc.
+"""
+
+import re
+
+
+def clean_doi(raw):
+    """
+    Normalizes a free-form DOI string, removing any:
+    - padding whitespace
+    - 'doi:' prefix
+    - URL prefix (http:// or https://, then doi.org/ or dx.doi.org/)
+
+    Does not try to un-URL-encode, and does not change case.
+
+    Args:
+        raw: string which might contain a DOI
+
+    Returns the bare DOI ("10.<registrant>/<suffix>"), or None if the input
+    contains internal whitespace or does not have that shape.
+    """
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    # Prefixes are stripped in this order so that combinations like
+    # "https://dx.doi.org/" come off piece by piece.
+    for prefix in ("doi:", "http://", "https://", "doi.org/", "dx.doi.org/"):
+        if raw.startswith(prefix):
+            raw = raw[len(prefix):]
+    # Require a numeric registrant code and a non-empty suffix; a bare
+    # "10." prefix check would accept strings like "10." or "10.1234".
+    if not re.fullmatch(r"10\.[0-9]+(\.[0-9]+)*/\S+", raw):
+        return None
+    return raw
+
+def test_clean_doi():  # padding and known prefixes are stripped; inner whitespace is rejected
+    assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("doi:10.1234/ asdf ") is None
+
+def clean_arxiv_id(raw):
+    """
+    Placeholder, not implemented yet: always returns None.
+
+    Intended to strip an 'arxiv:' prefix and to accept both versioned and
+    un-versioned arxiv identifiers.
+    """
+    return None
+
+def test_clean_arxiv_id():
+    """Placeholder: no assertions until clean_arxiv_id() is implemented."""
+
+def clean_pmcid(raw):
+    """Returns the stripped "PMC<digits>" identifier, or None if malformed."""
+    raw = raw.strip()
+    # ASCII-only match: str.isdigit() also accepts superscript/non-Latin digits.
+    if re.fullmatch(r"PMC[0-9]+", raw):
+        return raw
+    return None
+
+def clean_sha1(raw):
+    """Placeholder: always returns None; no SHA-1 check is implemented yet."""
+    if len(raw.strip().split()) != 1:
+        return None  # empty, or contains internal whitespace
+    return None  # single token, but validation is still missing
+
+def clean_issn(raw):
+    """Returns the stripped ISSN ("NNNN-NNNC", C a digit or "X"), else None."""
+    raw = raw.strip()
+    # Check both digit groups (not just the first); checksum is not verified.
+    if re.fullmatch(r"[0-9]{4}-[0-9]{3}[0-9X]", raw):
+        return raw
+    return None
+
+def test_clean_issn():  # a well-formed ISSN passes; malformed first groups are rejected
+    assert clean_issn("1234-4567") == "1234-4567"
+    assert clean_issn("134-4567") is None
+    assert clean_issn("123X-4567") is None
+
+def clean_isbn13(raw):
+    """Placeholder: always returns None; no ISBN-13 check is implemented yet."""
+    if len(raw.strip().split()) != 1:
+        return None  # empty, or contains internal whitespace
+    return None  # single token, but validation is still missing
+
+def clean_orcid(raw):
+    """Placeholder: always returns None; no ORCID check is implemented yet."""
+    if len(raw.strip().split()) != 1:
+        return None  # empty, or contains internal whitespace
+    return None  # single token, but validation is still missing
-- 
cgit v1.2.3