summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/normal.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-09-18 18:40:13 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-09-18 18:42:26 -0700
commit2e9194e0f2cf567ff719256fe80eb0d72b2434d7 (patch)
tree764af2718a0d071112affd58c0de4cd4873045f5 /python/fatcat_tools/normal.py
parent3e3f1c371c0f36b252315d07e788c37554170d89 (diff)
downloadfatcat-2e9194e0f2cf567ff719256fe80eb0d72b2434d7.tar.gz
fatcat-2e9194e0f2cf567ff719256fe80eb0d72b2434d7.zip
handle more external identifiers in python
This makes it possible to, eg, past an arxiv identifier or SHA-1 hash in the general search box and do a quick lookup.
Diffstat (limited to 'python/fatcat_tools/normal.py')
-rw-r--r--python/fatcat_tools/normal.py111
1 files changed, 97 insertions, 14 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 044ab87d..80bcfa5a 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -6,6 +6,7 @@ free-form input, titles, etc.
import re
+DOI_REGEX = re.compile("^10.\d{3,6}/\S+$")
def clean_doi(raw):
"""
@@ -33,7 +34,8 @@ def clean_doi(raw):
raw = raw[11:]
if not raw.startswith("10."):
return None
- # TODO: actual regex
+ if not DOI_REGEX.fullmatch(raw):
+ return None
return raw
def test_clean_doi():
@@ -43,6 +45,8 @@ def test_clean_doi():
assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
assert clean_doi("doi:10.1234/ asdf ") == None
+ARXIV_ID_REGEX = re.compile("^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
+
def clean_arxiv_id(raw):
"""
Removes any:
@@ -50,10 +54,41 @@ def clean_arxiv_id(raw):
Works with versioned or un-versioned arxiv identifiers.
"""
- pass
+ raw = raw.strip()
+ if raw.lower().startswith("arxiv:"):
+ raw = raw[6:]
+ if raw.lower().startswith("https://arxiv.org/abs/"):
+ raw = raw[22:]
+ if not ARXIV_ID_REGEX.fullmatch(raw):
+ return None
+ return raw
def test_clean_arxiv_id():
- pass
+ assert clean_arxiv_id("0806.2878v1") == "0806.2878v1"
+ assert clean_arxiv_id("0806.2878") == "0806.2878"
+ assert clean_arxiv_id("1501.00001v1") == "1501.00001v1"
+ assert clean_arxiv_id("1501.00001") == "1501.00001"
+ assert clean_arxiv_id("hep-th/9901001v1") == "hep-th/9901001v1"
+ assert clean_arxiv_id("hep-th/9901001") == "hep-th/9901001"
+ assert clean_arxiv_id("math.CA/0611800v2") == "math.CA/0611800v2"
+ assert clean_arxiv_id("math.CA/0611800") == "math.CA/0611800"
+ assert clean_arxiv_id("0806.2878v1 ") == "0806.2878v1"
+
+ assert clean_arxiv_id("https://arxiv.org/abs/0806.2878v1") == "0806.2878v1"
+ assert clean_arxiv_id("arxiv:0806.2878v1") == "0806.2878v1"
+ assert clean_arxiv_id("arXiv:0806.2878v1") == "0806.2878v1"
+
+ assert clean_arxiv_id("hep-TH/9901001v1") == None
+ assert clean_arxiv_id("hßp-th/9901001v1") == None
+ assert clean_arxiv_id("math.CA/06l1800v2") == None
+ assert clean_arxiv_id("mßth.ca/0611800v2") == None
+ assert clean_arxiv_id("MATH.CA/0611800v2") == None
+ assert clean_arxiv_id("0806.2878v23") == "0806.2878v23" # ?
+ assert clean_arxiv_id("0806.2878v") == None
+ assert clean_arxiv_id("0806.2878") == "0806.2878"
+ assert clean_arxiv_id("006.2878v1") == None
+ assert clean_arxiv_id("0806.v1") == None
+ assert clean_arxiv_id("08062878v1") == None
def clean_pmcid(raw):
raw = raw.strip()
@@ -64,32 +99,80 @@ def clean_pmcid(raw):
return None
def clean_sha1(raw):
- raw = raw.strip()
+ raw = raw.strip().lower()
if len(raw.split()) != 1:
return None
- pass
+ if len(raw) != 40:
+ return None
+ for c in raw:
+ if c not in "0123456789abcdef":
+ return None
+ return raw
-def clean_issn(raw):
- raw = raw.strip()
+def test_clean_sha1():
+ assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b"
+ assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b"
+ assert clean_sha1("fba3fba0e1937aa0297de3836b768b5dfb23d7b") == None
+ assert clean_sha1("qfba3fba0e1937aa0297de3836b768b5dfb23d7b") == None
+ assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") == None
+
+def clean_sha256(raw):
+ raw = raw.strip().lower()
if len(raw.split()) != 1:
return None
- if len(raw) == 9 and raw[4] == "-" and raw[0:4].isdigit():
- return raw
- return None
+ if len(raw) != 64:
+ return None
+ for c in raw:
+ if c not in "0123456789abcdef":
+ return None
+ return raw
+
+def test_clean_sha256():
+ assert clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f"
+ assert clean_sha256("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == None
+
+ISSN_REGEX = re.compile("^\d{4}-\d{3}[0-9X]$")
+
+def clean_issn(raw):
+ raw = raw.strip().upper()
+ if len(raw) != 9:
+ return None
+ if not ISSN_REGEX.fullmatch(raw):
+ return None
+ return raw
def test_clean_issn():
assert clean_issn("1234-4567") == "1234-4567"
+ assert clean_issn("1234-456X") == "1234-456X"
assert clean_issn("134-4567") == None
assert clean_issn("123X-4567") == None
+ISBN13_REGEX = re.compile("^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$")
+
def clean_isbn13(raw):
raw = raw.strip()
- if len(raw.split()) != 1:
+ if not ISBN13_REGEX.fullmatch(raw):
return None
- return None
+ return raw
+
+def test_clean_isbn13():
+ assert clean_isbn13("978-1-56619-909-4") == "978-1-56619-909-4"
+ assert clean_isbn13("978-1-4028-9462-6") == "978-1-4028-9462-6"
+ assert clean_isbn13("978-1-56619-909-4 ") == "978-1-56619-909-4"
+ assert clean_isbn13("9781566199094") == None
+
+ORCID_REGEX = re.compile("^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")
def clean_orcid(raw):
raw = raw.strip()
- if len(raw.split()) != 1:
+ if not ORCID_REGEX.fullmatch(raw):
return None
- return None
+ return raw
+
+def test_clean_orcid():
+ assert clean_orcid("0123-4567-3456-6789") == "0123-4567-3456-6789"
+ assert clean_orcid("0123-4567-3456-678X") == "0123-4567-3456-678X"
+ assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789"
+ assert clean_orcid("01234567-3456-6780") == None
+ assert clean_orcid("0x23-4567-3456-6780") == None
+