2 files changed, 0 insertions, 288 deletions
diff --git a/python/refcat/utils.py b/python/refcat/utils.py
index 2fbaec5..a665a25 100644
--- a/python/refcat/utils.py
+++ b/python/refcat/utils.py
@@ -2,245 +2,7 @@
 Assorted utilities.
 """
 
-import collections
 import io
-import re
-
-DOI_PATTERN = re.compile(r"10.[0-9]{1,6}/[^ ]*[\w]")
-
-# via: https://gist.github.com/gruber/8891611
-URL_PATTERN = re.compile(r"""(?xi)
-\b
-(					# Capture 1: entire matched URL
-  (?:
-    https?:				# URL protocol and colon
-    (?:
-      /{1,3}				# 1-3 slashes
-      |					#   or
-      [a-z0-9%]				# Single letter or digit or '%'
-					# (Trying not to match e.g. "URI::Escape")
-    )
-    |					#   or
-					# looks like domain name followed by a slash:
-    [a-z0-9.\-]+[.]
-    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj| Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
-    /
-  )
-  (?:					# One or more:
-    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
-    |					#   or
-    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (…(…)…)
-    |
-    \([^\s]+?\)				# balanced parens, non-recursive: (…)
-  )+
-  (?:					# End with:
-    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (…(…)…)
-    |
-    \([^\s]+?\)				# balanced parens, non-recursive: (…)
-    |					#   or
-    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
-  )
-  |					# OR, the following to match naked domains:
-  (?:
-	(?<!@)				# not preceded by a @, avoid matching foo@_gmail.com_
-    [a-z0-9]+
-    (?:[.\-][a-z0-9]+)*
-    [.]
-    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj| Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
-    \b
-    /?
-    (?!@)				# not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
-  )
-)
-""")
-
-
-def derive_from_ref(doc):
-    """
-    Given a ref document, return an extended document.
-
-    * biblio might have doi, or url with doi
-
-    Sometimes the doi is not the doi, e.g.
-
-    ... "doi": "https:/www.atsjournals.org/doi/10.1513/AnnalsATS.201602-127OC",
-
-    Do less expensive lookups first:
-
-    * is there a doi? we need a (sorted) doi release ident table for quick
-    lookups, e.g. shelve
-    * is there some doi hidden in a url or elsewhere?
-    * is there a QID?
-    * derive a number of keys (from fuzzycat) from title and authors
-    * find the key in a prepared key-value store or do some binary search over a sorted cache file
-    * run verification between a pseudo-release and a release document
-    * run verification between a pseudo-release and a release document
-
-    """
-    raise NotImplementedError
-
-
-def extract_urls(s):
-    return URL_PATTERN.findall(s)
-
-
-def extract_dois(s):
-    return DOI_PATTERN.findall(s)
-
-
-def cleanup_url(url):
-    """
-    Given a URL string, check if it yields a 200.
-
-    http://www.ces.clemson.edu/~ahoover/stare/[14
-    http://rsbweb.nih.gov/ij/(accessed
-    http://www.construction-chanvre.asso.fr.
-    http://dx
-    USDA/RM/SARH/INIFAPGeneral
-    ...
-
-    """
-    raise NotImplementedError
-
-
-def ref_to_release(ref, require_title=False):
-    """
-    Turn ref into a release.
-
-    {
-      "biblio": {
-	"container_name": "Leisure in Contemporary Society",
-	"contrib_raw_names": [
-	  "K Roberts"
-	],
-	"unstructured": "Roberts, K. (1999) Leisure in Contemporary Society. Oxford: CABI Publishing.",
-	"year": 1999
-      },
-      "index": 10,
-      "key": "8_CR11",
-      "ref_source": "crossref",
-      "release_ident": "k7jnyix375blxffr4llcc7gfaa",
-      "release_year": 2007,
-      "work_ident": "aaaonixgi5b6ziulq4odznf5fa"
-    }
-
-    Sample (1M) keys:
-
-    {
-      "has_any_extid": 393409,
-      "has_container_volume_issue_pages": 58133,
-      "has_title_contrib_year": 467613,
-      "has_contrib_container_year": 562049,
-      "has_title_container_year": 357526,
-      "total": 1000000,
-      "has_key": 917486,
-      "source_grobid": 457224,
-      "has_release_ident": 1000000,
-      "has_release_year": 965173,
-      "has_work_ident": 1000000,
-      "has_container_name": 590519,
-      "has_contrib_raw_names": 729729,
-      "has_issue": 77384,
-      "has_pages": 339494,
-      "has_title": 524288,
-      "has_unstructured": 651102,
-      "has_volume": 533352,
-      "has_year": 721823,
-      "has_index": 969778,
-      "has_publisher": 40750,
-      "source_crossref": 425789,
-      "has_doi": 344885,
-      "has_locator": 246106,
-      "has_url": 25284,
-      "source_datacite": 34855,
-      "source_pubmed": 66027,
-      "has_target_release_id": 44076,
-      "has_pmid": 49144,
-      "source_fatcat": 16105,
-      "has_arxiv_id": 2018,
-      "has_pmcid": 45
-    }
-
-    Report from indigo (10M):
-
-    {
-      "biblio": 10000000,
-      "biblio.arxiv_id": 23227,
-      "biblio.container_name": 5760760,
-      "biblio.contrib_raw_names": 7156385,
-      "biblio.doi": 3584451,
-      "biblio.issue": 763784,
-      "biblio.pages": 3331911,
-      "biblio.pmcid": 776,
-      "biblio.pmid": 471338,
-      "biblio.publisher": 398305,
-      "biblio.title": 5164864,
-      "biblio.unstructured": 6304402,
-      "biblio.url": 256771,
-      "biblio.volume": 5202508,
-      "biblio.year": 7055442,
-      "index": 10000000,
-      "key": 8986307,
-      "locator": 2390436,
-      "ref_source": 10000000,
-      "release_ident": 10000000,
-      "release_year": 9629380,
-      "target_release_id": 419033,
-      "work_ident": 10000000
-    }
-
-    TODO: reduce footprint in the "_" meta section.
-
-    """
-    if not "biblio" in ref:
-        return None
-
-    biblio = ref["biblio"]
-    result = collections.defaultdict(collections.defaultdict)
-
-    # we need this for clustering
-    result["ident"] = ref["release_ident"]
-
-    result["_"]["release_ident"] = ref["release_ident"]
-    result["_"]["work_ident"] = ref["work_ident"]
-    if "target_release_id" in ref:
-        # via pmid?
-        result["_"]["target_release_id"] = ref["target_release_id"]
-
-    if "arxiv_id" in biblio:
-        result["ext_ids"]["arxiv"] = biblio["arxiv_id"]
-    if "doi" in biblio:
-        result["ext_ids"]["doi"] = biblio["doi"]
-    if "pmid" in biblio:
-        result["ext_ids"]["pmid"] = biblio["pmid"]
-    if "pmcid" in biblio:
-        result["ext_ids"]["pmcid"] = biblio["pmcid"]
-
-    if "title" in biblio:
-        result["title"] = biblio["title"]
-    elif require_title:
-        if "ext_ids" in result:
-            return result
-        else:
-            return None
-
-    if "publisher" in biblio:
-        result["publisher"] = biblio["publisher"]
-    if "container_name" in biblio:
-        result["container"]["name"] = biblio["container_name"]
-    if "volume" in biblio:
-        result["volume"] = biblio["volume"]
-    if "issue" in biblio:
-        result["issue"] = biblio["issue"]
-    if "pages" in biblio:
-        result["pages"] = biblio["pages"]
-    if "release_year" in biblio:
-        result["release_year"] = biblio["release_year"]
-    if "contrib_raw_names" in biblio:
-        result["contribs"] = [{"raw_name": name} for name in biblio["contrib_raw_names"]]
-
-    return result
-
 
 def columnize(lines, term_width=80, indent=0, pad=2):
     n_lines = len(lines)
diff --git a/python/tests/test_utils.py b/python/tests/test_utils.py
deleted file mode 100644
index acc1888..0000000
--- a/python/tests/test_utils.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from refcat.utils import extract_urls, extract_dois
-
-
-def test_extract_urls():
-    assert extract_urls("") == []
-    assert extract_urls("abc") == []
-    assert extract_urls("httP//abc") == []
-    assert extract_urls("http//a.com") == ["a.com"]
-    assert extract_urls("http://a.com") == ["http://a.com"]
-    assert extract_urls("http://a.com/b") == ["http://a.com/b"]
-    assert extract_urls("https://a.com/b") == ["https://a.com/b"]
-    assert extract_urls("http=://a.com/b") == ["a.com/"]
-    assert extract_urls("http://www.bioinformatics.babraham.ac.uk/projects/fastqc/") == [
-        "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/"
-    ]
-    assert extract_urls(
-        "CertificaçãoDigitalNº1311532/CA40/005129/2012Apensadoao40/006810/2011-1ºTermoAditivonº52/2012aoContratonº282/2011-Celebradoem08/08/2012"
-    ) == []
-    assert extract_urls(
-        "http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en"
-    ) == ["http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en"]
-    assert extract_urls("DOI:10.1093/forestry/cpr048") == []
-    assert extract_urls("www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228") == [
-        "www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228"
-    ]
-    assert extract_urls("http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"]
-    assert extract_urls("hello http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"]
-    assert extract_urls("hello http://bit.ly/cJbkv http://bit.ly/cJbkv") == [
-        "http://bit.ly/cJbkv", "http://bit.ly/cJbkv"
-    ]
-    assert extract_urls("jul./set.de") == ["set.de"]
-
-
-def test_extract_doi():
-    assert extract_dois("https://doi.org/10.1016/j.jsr.2003.05.009") == ["10.1016/j.jsr.2003.05.009"]
-    assert extract_dois("http://dx.doi.org/10.1002/elps.200500338") == ["10.1002/elps.200500338"]
-
-    assert extract_dois("!!10.1016/j.chiabu.2013.09.002") == ['10.1016/j.chiabu.2013.09.002']
-    assert extract_dois("!!10.1049/joe.2014.0134.!") == ["10.1049/joe.2014.0134"]
-    assert extract_dois("!!10.1080/00335630.2012.714899") == ["10.1080/00335630.2012.714899"]
-    assert extract_dois("!!10.1177/1075547007306508.!") == ["10.1177/1075547007306508"]
-    assert extract_dois("!!445!!10.3390/nu6114822") == ["10.3390/nu6114822"]
-    assert extract_dois("!0141-9889,!pp.!448-464!doi:!10.1111/j.1467J9566.2010.01286.!") == [
-        "10.1111/j.1467J9566.2010.01286"
-    ]
-    assert extract_dois("!10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO") == [
-        "10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO"
-    ]
-    assert extract_dois("!10.1002/ajpa.20674.!") == ["10.1002/ajpa.20674"]
-    assert extract_dois("!10.1002/chem.201700953.!") == ["10.1002/chem.201700953"]