# Source: python/fatcat_tools/normal.py, as added by commit
# 55ebdd8f06e73ef904e8c216a6e8a67f7fded8d2 ("start work on 'generic' search
# box", Bryan Newbold, Tue, 11 Jun 2019).

"""
A bunch of helpers to parse and normalize strings: external identifiers,
free-form input, titles, etc.

Every ``clean_*`` helper takes a raw string, strips padding whitespace, and
returns the cleaned identifier, or None if the input is not a single
well-formed identifier of that type.
"""

import re

# "10." + registrant code (digits, optionally dot-subdivided) + "/" + suffix
_DOI_REGEX = re.compile(r"10\.[0-9]+(?:\.[0-9]+)*/\S+")
# Prefixes stripped from DOIs, in order (scheme first, then resolver host)
_DOI_PREFIXES = ("doi:", "http://", "https://", "doi.org/", "dx.doi.org/")
# New-style (YYMM.NNNNN) or old-style (archive[.XX]/YYMMNNN), optional version
_ARXIV_ID_REGEX = re.compile(
    r"(?:[0-9]{4}\.[0-9]{4,5}|[a-z\-]+(?:\.[A-Z]{2})?/[0-9]{7})(?:v[0-9]+)?"
)
_PMCID_REGEX = re.compile(r"PMC[0-9]+")
_SHA1_REGEX = re.compile(r"[0-9a-f]{40}")
_ISSN_REGEX = re.compile(r"[0-9]{4}-[0-9]{3}[0-9X]")
# Digit groups separated by single hyphens (no leading/trailing/double hyphen)
_ISBN_GROUPS_REGEX = re.compile(r"[0-9]+(?:-[0-9]+)*")
_ORCID_REGEX = re.compile(r"[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]")


def _single_token(raw):
    """
    Strips padding whitespace from raw.

    Returns None if raw is None/empty, or if it does not consist of exactly
    one whitespace-free token.
    """
    if not raw:
        return None
    token = raw.strip()
    if len(token.split()) != 1:
        return None
    return token


def clean_doi(raw):
    """
    Removes any:
    - padding whitespace
    - 'doi:' prefix
    - URL prefix

    Prefixes are matched case-insensitively; the case of the DOI itself is
    left untouched.

    Does not try to un-URL-encode

    Returns None if not a valid DOI
    """
    token = _single_token(raw)
    if token is None:
        return None
    for prefix in _DOI_PREFIXES:
        # compare only the leading slice so the DOI body keeps its case
        if token[:len(prefix)].lower() == prefix:
            token = token[len(prefix):]
    if not _DOI_REGEX.fullmatch(token):
        return None
    return token


def test_clean_doi():
    assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("DOI:10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("doi:10.1234/ asdf ") is None
    assert clean_doi("10.") is None
    assert clean_doi("10.1234") is None
    assert clean_doi("") is None


def clean_arxiv_id(raw):
    """
    Removes any:
    - padding whitespace
    - 'arxiv:' prefix (case-insensitive)

    Works with versioned or un-versioned arxiv identifiers, in both the
    new style ("1901.00001v2") and the old style ("hep-th/9901001").

    Returns None if not a valid arxiv identifier
    """
    token = _single_token(raw)
    if token is None:
        return None
    if token[:6].lower() == "arxiv:":
        token = token[6:]
    if not _ARXIV_ID_REGEX.fullmatch(token):
        return None
    return token


def test_clean_arxiv_id():
    assert clean_arxiv_id("1901.00001") == "1901.00001"
    assert clean_arxiv_id("arxiv:1901.00001v2 ") == "1901.00001v2"
    assert clean_arxiv_id("arXiv:hep-th/9901001") == "hep-th/9901001"
    assert clean_arxiv_id("math.GT/0309136v1") == "math.GT/0309136v1"
    assert clean_arxiv_id("1901.1") is None
    assert clean_arxiv_id("arxiv: 1901.00001") is None


def clean_pmcid(raw):
    """
    Removes padding whitespace.

    Returns None if not "PMC" followed by one or more ASCII digits
    """
    token = _single_token(raw)
    if token is None:
        return None
    # explicit [0-9]: str.isdigit() would also accept characters like "²"
    if not _PMCID_REGEX.fullmatch(token):
        return None
    return token


def test_clean_pmcid():
    assert clean_pmcid(" PMC12345 ") == "PMC12345"
    assert clean_pmcid("PMC") is None
    assert clean_pmcid("PMC\u00b2") is None
    assert clean_pmcid("12345") is None


def clean_sha1(raw):
    """
    Removes padding whitespace and lower-cases the hex digest.

    Returns None if not exactly 40 hexadecimal characters
    """
    token = _single_token(raw)
    if token is None:
        return None
    token = token.lower()
    if not _SHA1_REGEX.fullmatch(token):
        return None
    return token


def test_clean_sha1():
    digest = "da39a3ee5e6b4b0d3255bfef95601890afd80709"
    assert clean_sha1(digest + " ") == digest
    assert clean_sha1(digest.upper()) == digest
    assert clean_sha1(digest[:-1]) is None
    assert clean_sha1("g" + digest[1:]) is None


def clean_issn(raw):
    """
    Removes padding whitespace and upper-cases the 'X' check character.

    Only the NNNN-NNNC shape is validated (C is a digit or 'X'); the check
    digit itself is not verified.

    Returns None if not a valid ISSN
    """
    token = _single_token(raw)
    if token is None:
        return None
    token = token.upper()
    if not _ISSN_REGEX.fullmatch(token):
        return None
    return token


def test_clean_issn():
    assert clean_issn("1234-4567") == "1234-4567"
    assert clean_issn("1234-567x") == "1234-567X"
    assert clean_issn("134-4567") is None
    assert clean_issn("123X-4567") is None
    assert clean_issn("1234-abcd") is None


def clean_isbn13(raw):
    """
    Removes padding whitespace.

    Accepts 13 digits, optionally grouped with single hyphens, starting
    with the 978 or 979 prefix and carrying a correct check digit. The
    identifier is returned as supplied (hyphens are kept).

    Returns None if not a valid ISBN-13
    """
    token = _single_token(raw)
    if token is None:
        return None
    if not _ISBN_GROUPS_REGEX.fullmatch(token):
        return None
    digits = token.replace("-", "")
    if len(digits) != 13 or digits[:3] not in ("978", "979"):
        return None
    # ISBN-13 check: digits weighted 1,3,1,3,... must sum to a multiple of 10
    total = sum(int(d) * (3 if i % 2 else 1) for i, d in enumerate(digits))
    if total % 10 != 0:
        return None
    return token


def test_clean_isbn13():
    assert clean_isbn13("978-3-16-148410-0 ") == "978-3-16-148410-0"
    assert clean_isbn13("9783161484100") == "9783161484100"
    assert clean_isbn13("978-3-16-148410-1") is None
    assert clean_isbn13("978--3161484100") is None
    assert clean_isbn13("1234567890123") is None


def clean_orcid(raw):
    """
    Removes padding whitespace and upper-cases the 'X' check character.

    Only the NNNN-NNNN-NNNN-NNNC shape is validated (C is a digit or 'X');
    the check digit itself is not verified.

    Returns None if not a valid ORCID
    """
    token = _single_token(raw)
    if token is None:
        return None
    token = token.upper()
    if not _ORCID_REGEX.fullmatch(token):
        return None
    return token


def test_clean_orcid():
    assert clean_orcid("0000-0002-1825-0097 ") == "0000-0002-1825-0097"
    assert clean_orcid("0000-0002-1694-233x") == "0000-0002-1694-233X"
    assert clean_orcid("0000-0002-1825-009") is None
    assert clean_orcid("0000 0002 1825 0097") is None