diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/normal.py | 111 | ||||
| -rw-r--r-- | python/fatcat_web/routes.py | 4 | 
2 files changed, 101 insertions, 14 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 044ab87d..80bcfa5a 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -6,6 +6,7 @@ free-form input, titles, etc.  import re +DOI_REGEX = re.compile("^10.\d{3,6}/\S+$")  def clean_doi(raw):      """ @@ -33,7 +34,8 @@ def clean_doi(raw):          raw = raw[11:]      if not raw.startswith("10."):          return None -    # TODO: actual regex +    if not DOI_REGEX.fullmatch(raw): +        return None      return raw  def test_clean_doi(): @@ -43,6 +45,8 @@ def test_clean_doi():      assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"      assert clean_doi("doi:10.1234/ asdf ") == None +ARXIV_ID_REGEX = re.compile("^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$") +  def clean_arxiv_id(raw):      """      Removes any: @@ -50,10 +54,41 @@ def clean_arxiv_id(raw):      Works with versioned or un-versioned arxiv identifiers.      """ -    pass +    raw = raw.strip() +    if raw.lower().startswith("arxiv:"): +        raw = raw[6:] +    if raw.lower().startswith("https://arxiv.org/abs/"): +        raw = raw[22:] +    if not ARXIV_ID_REGEX.fullmatch(raw): +        return None +    return raw  def test_clean_arxiv_id(): -    pass +    assert clean_arxiv_id("0806.2878v1") == "0806.2878v1" +    assert clean_arxiv_id("0806.2878") == "0806.2878" +    assert clean_arxiv_id("1501.00001v1") == "1501.00001v1" +    assert clean_arxiv_id("1501.00001") == "1501.00001" +    assert clean_arxiv_id("hep-th/9901001v1") == "hep-th/9901001v1" +    assert clean_arxiv_id("hep-th/9901001") == "hep-th/9901001" +    assert clean_arxiv_id("math.CA/0611800v2") == "math.CA/0611800v2" +    assert clean_arxiv_id("math.CA/0611800") == "math.CA/0611800" +    assert clean_arxiv_id("0806.2878v1 ") == "0806.2878v1" + +    assert clean_arxiv_id("https://arxiv.org/abs/0806.2878v1") == "0806.2878v1" +    assert clean_arxiv_id("arxiv:0806.2878v1") == "0806.2878v1" +    assert clean_arxiv_id("arXiv:0806.2878v1") == "0806.2878v1" + +    assert clean_arxiv_id("hep-TH/9901001v1") == None +    assert clean_arxiv_id("hßp-th/9901001v1") == None +    assert clean_arxiv_id("math.CA/06l1800v2") == None +    assert clean_arxiv_id("mßth.ca/0611800v2") == None +    assert clean_arxiv_id("MATH.CA/0611800v2") == None +    assert clean_arxiv_id("0806.2878v23") == "0806.2878v23"  # ? +    assert clean_arxiv_id("0806.2878v") == None +    assert clean_arxiv_id("0806.2878") == "0806.2878" +    assert clean_arxiv_id("006.2878v1") == None +    assert clean_arxiv_id("0806.v1") == None +    assert clean_arxiv_id("08062878v1") == None  def clean_pmcid(raw):      raw = raw.strip() @@ -64,32 +99,80 @@ def clean_pmcid(raw):      return None  def clean_sha1(raw): -    raw = raw.strip() +    raw = raw.strip().lower()      if len(raw.split()) != 1:          return None -    pass +    if len(raw) != 40: +        return None +    for c in raw: +        if c not in "0123456789abcdef": +            return None +    return raw -def clean_issn(raw): -    raw = raw.strip() +def test_clean_sha1(): +    assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" +    assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" +    assert clean_sha1("fba3fba0e1937aa0297de3836b768b5dfb23d7b") == None +    assert clean_sha1("qfba3fba0e1937aa0297de3836b768b5dfb23d7b") == None +    assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") == None + +def clean_sha256(raw): +    raw = raw.strip().lower()      if len(raw.split()) != 1:          return None -    if len(raw) == 9 and raw[4] == "-" and raw[0:4].isdigit(): -        return raw -    return None +    if len(raw) != 64: +        return None +    for c in raw: +        if c not in "0123456789abcdef": +            return None +    return raw + +def test_clean_sha256(): +    assert clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" +    assert clean_sha256("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == None + +ISSN_REGEX = re.compile("^\d{4}-\d{3}[0-9X]$") + +def clean_issn(raw): +    raw = raw.strip().upper() +    if len(raw) != 9: +        return None +    if not ISSN_REGEX.fullmatch(raw): +        return None +    return raw  def test_clean_issn():      assert clean_issn("1234-4567") == "1234-4567" +    assert clean_issn("1234-456X") == "1234-456X"      assert clean_issn("134-4567") == None      assert clean_issn("123X-4567") == None +ISBN13_REGEX = re.compile("^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$") +  def clean_isbn13(raw):      raw = raw.strip() -    if len(raw.split()) != 1: +    if not ISBN13_REGEX.fullmatch(raw):          return None -    return None +    return raw + +def test_clean_isbn13(): +    assert clean_isbn13("978-1-56619-909-4") == "978-1-56619-909-4" +    assert clean_isbn13("978-1-4028-9462-6") == "978-1-4028-9462-6" +    assert clean_isbn13("978-1-56619-909-4 ") == "978-1-56619-909-4" +    assert clean_isbn13("9781566199094") == None + +ORCID_REGEX = re.compile("^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")  def clean_orcid(raw):      raw = raw.strip() -    if len(raw.split()) != 1: +    if not ORCID_REGEX.fullmatch(raw):          return None -    return None +    return raw + +def test_clean_orcid(): +    assert clean_orcid("0123-4567-3456-6789") == "0123-4567-3456-6789" +    assert clean_orcid("0123-4567-3456-678X") == "0123-4567-3456-678X" +    assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789" +    assert clean_orcid("01234567-3456-6780") == None +    assert clean_orcid("0x23-4567-3456-6780") == None + diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 036dcea0..1213ce11 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -618,10 +618,14 @@ def generic_search():          return redirect(url_for('release_lookup', pmcid=clean_pmcid(query)))      if clean_sha1(query):          return redirect(url_for('file_lookup', sha1=clean_sha1(query))) +    if clean_sha256(query): +        return redirect(url_for('file_lookup', sha256=clean_sha256(query)))      if clean_issn(query):          return redirect(url_for('container_lookup', issnl=clean_issn(query)))      if clean_isbn13(query):          return redirect(url_for('release_lookup', isbn13=clean_isbn13(query))) +    if clean_arxiv_id(query): +        return redirect(url_for('release_lookup', arxiv=clean_arxiv_id(query)))      if clean_orcid(query):          return redirect(url_for('creator_lookup', orcid=clean_orcid(query)))  | 
