-rw-r--r--  python/refcat/utils.py      | 238
-rw-r--r--  python/tests/test_utils.py  |  50
-rw-r--r--  skate/set/set.go            |   6
3 files changed, 3 insertions(+), 291 deletions(-)
diff --git a/python/refcat/utils.py b/python/refcat/utils.py
index 2fbaec5..a665a25 100644
--- a/python/refcat/utils.py
+++ b/python/refcat/utils.py
@@ -2,245 +2,7 @@
Assorted utilities.
"""
-import collections
import io
-import re
-
-DOI_PATTERN = re.compile(r"10.[0-9]{1,6}/[^ ]*[\w]")
-
-# via: https://gist.github.com/gruber/8891611
-URL_PATTERN = re.compile(r"""(?xi)
-\b
-( # Capture 1: entire matched URL
- (?:
- https?: # URL protocol and colon
- (?:
- /{1,3} # 1-3 slashes
- | # or
- [a-z0-9%] # Single letter or digit or '%'
- # (Trying not to match e.g. "URI::Escape")
- )
- | # or
- # looks like domain name followed by a slash:
- [a-z0-9.\-]+[.]
- (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
- /
- )
- (?: # One or more:
- [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
- | # or
- \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
- |
- \([^\s]+?\) # balanced parens, non-recursive: (…)
- )+
- (?: # End with:
- \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
- |
- \([^\s]+?\) # balanced parens, non-recursive: (…)
- | # or
- [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
- )
- | # OR, the following to match naked domains:
- (?:
- (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
- [a-z0-9]+
- (?:[.\-][a-z0-9]+)*
- [.]
- (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
- \b
- /?
- (?!@) # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
- )
-)
-""")
-
-
-def derive_from_ref(doc):
- """
- Given a ref document, return an extended document.
-
- * biblio might have doi, or url with doi
-
- Sometimes the doi is not the doi, e.g.
-
- ... "doi": "https:/www.atsjournals.org/doi/10.1513/AnnalsATS.201602-127OC",
-
- Do less expensive lookups first:
-
- * is there a doi? we need a (sorted) doi release ident table for quick
- lookups, e.g. shelve
- * is there some doi hidden in a url or elsewhere?
- * is there a QID?
- * derive a number of keys (from fuzzycat) from title and authors
- * find the key in a prepared key-value store or do some binary search over a sorted cache file
- * run verification between a pseudo-release and a release document
-
- """
- raise NotImplementedError
-
-
-def extract_urls(s):
- return URL_PATTERN.findall(s)
-
-
-def extract_dois(s):
- return DOI_PATTERN.findall(s)
-
-
-def cleanup_url(url):
- """
- Given a URL string, check if it yields a 200.
-
- http://www.ces.clemson.edu/~ahoover/stare/[14
- http://rsbweb.nih.gov/ij/(accessed
- http://www.construction-chanvre.asso.fr.
- http://dx
- USDA/RM/SARH/INIFAPGeneral
- ...
-
- """
- raise NotImplementedError
-
-
-def ref_to_release(ref, require_title=False):
- """
- Turn ref into a release.
-
- {
- "biblio": {
- "container_name": "Leisure in Contemporary Society",
- "contrib_raw_names": [
- "K Roberts"
- ],
- "unstructured": "Roberts, K. (1999) Leisure in Contemporary Society. Oxford: CABI Publishing.",
- "year": 1999
- },
- "index": 10,
- "key": "8_CR11",
- "ref_source": "crossref",
- "release_ident": "k7jnyix375blxffr4llcc7gfaa",
- "release_year": 2007,
- "work_ident": "aaaonixgi5b6ziulq4odznf5fa"
- }
-
- Sample (1M) keys:
-
- {
- "has_any_extid": 393409,
- "has_container_volume_issue_pages": 58133,
- "has_title_contrib_year": 467613,
- "has_contrib_container_year": 562049,
- "has_title_container_year": 357526,
- "total": 1000000,
- "has_key": 917486,
- "source_grobid": 457224,
- "has_release_ident": 1000000,
- "has_release_year": 965173,
- "has_work_ident": 1000000,
- "has_container_name": 590519,
- "has_contrib_raw_names": 729729,
- "has_issue": 77384,
- "has_pages": 339494,
- "has_title": 524288,
- "has_unstructured": 651102,
- "has_volume": 533352,
- "has_year": 721823,
- "has_index": 969778,
- "has_publisher": 40750,
- "source_crossref": 425789,
- "has_doi": 344885,
- "has_locator": 246106,
- "has_url": 25284,
- "source_datacite": 34855,
- "source_pubmed": 66027,
- "has_target_release_id": 44076,
- "has_pmid": 49144,
- "source_fatcat": 16105,
- "has_arxiv_id": 2018,
- "has_pmcid": 45
- }
-
- Report from indigo (10M):
-
- {
- "biblio": 10000000,
- "biblio.arxiv_id": 23227,
- "biblio.container_name": 5760760,
- "biblio.contrib_raw_names": 7156385,
- "biblio.doi": 3584451,
- "biblio.issue": 763784,
- "biblio.pages": 3331911,
- "biblio.pmcid": 776,
- "biblio.pmid": 471338,
- "biblio.publisher": 398305,
- "biblio.title": 5164864,
- "biblio.unstructured": 6304402,
- "biblio.url": 256771,
- "biblio.volume": 5202508,
- "biblio.year": 7055442,
- "index": 10000000,
- "key": 8986307,
- "locator": 2390436,
- "ref_source": 10000000,
- "release_ident": 10000000,
- "release_year": 9629380,
- "target_release_id": 419033,
- "work_ident": 10000000
- }
-
- TODO: reduce footprint in the "_" meta section.
-
- """
- if not "biblio" in ref:
- return None
-
- biblio = ref["biblio"]
- result = collections.defaultdict(collections.defaultdict)
-
- # we need this for clustering
- result["ident"] = ref["release_ident"]
-
- result["_"]["release_ident"] = ref["release_ident"]
- result["_"]["work_ident"] = ref["work_ident"]
- if "target_release_id" in ref:
- # via pmid?
- result["_"]["target_release_id"] = ref["target_release_id"]
-
- if "arxiv_id" in biblio:
- result["ext_ids"]["arxiv"] = biblio["arxiv_id"]
- if "doi" in biblio:
- result["ext_ids"]["doi"] = biblio["doi"]
- if "pmid" in biblio:
- result["ext_ids"]["pmid"] = biblio["pmid"]
- if "pmcid" in biblio:
- result["ext_ids"]["pmcid"] = biblio["pmcid"]
-
- if "title" in biblio:
- result["title"] = biblio["title"]
- elif require_title:
- if "ext_ids" in result:
- return result
- else:
- return None
-
- if "publisher" in biblio:
- result["publisher"] = biblio["publisher"]
- if "container_name" in biblio:
- result["container"]["name"] = biblio["container_name"]
- if "volume" in biblio:
- result["volume"] = biblio["volume"]
- if "issue" in biblio:
- result["issue"] = biblio["issue"]
- if "pages" in biblio:
- result["pages"] = biblio["pages"]
- if "release_year" in biblio:
- result["release_year"] = biblio["release_year"]
- if "contrib_raw_names" in biblio:
- result["contribs"] = [{"raw_name": name} for name in biblio["contrib_raw_names"]]
-
- return result
-
def columnize(lines, term_width=80, indent=0, pad=2):
n_lines = len(lines)
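
Of the removed helpers above, cleanup_url was only a stub documenting intent: take a candidate URL (often carrying citation debris, as its docstring examples show) and check whether it actually resolves with a 200. Purely for illustration, a minimal sketch of that check in Go; it is not part of this commit, and the name yields200 is invented here:

    package main

    import (
    	"fmt"
    	"net/http"
    	"time"
    )

    // yields200 reports whether url answers an HTTP GET with status 200.
    // This sketches what the removed cleanup_url stub described; a real
    // implementation would first strip trailing debris like "[14" or
    // "(accessed" from the candidate string.
    func yields200(url string) bool {
    	client := &http.Client{Timeout: 10 * time.Second}
    	resp, err := client.Get(url)
    	if err != nil {
    		return false
    	}
    	defer resp.Body.Close()
    	return resp.StatusCode == http.StatusOK
    }

    func main() {
    	fmt.Println(yields200("http://dx")) // false: not resolvable
    }
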
diff --git a/python/tests/test_utils.py b/python/tests/test_utils.py
deleted file mode 100644
index acc1888..0000000
--- a/python/tests/test_utils.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from refcat.utils import extract_urls, extract_dois
-
-
-def test_extract_urls():
- assert extract_urls("") == []
- assert extract_urls("abc") == []
- assert extract_urls("httP//abc") == []
- assert extract_urls("http//a.com") == ["a.com"]
- assert extract_urls("http://a.com") == ["http://a.com"]
- assert extract_urls("http://a.com/b") == ["http://a.com/b"]
- assert extract_urls("https://a.com/b") == ["https://a.com/b"]
- assert extract_urls("http=://a.com/b") == ["a.com/"]
- assert extract_urls("http://www.bioinformatics.babraham.ac.uk/projects/fastqc/") == [
- "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/"
- ]
- assert extract_urls(
- "CertificaçãoDigitalNº1311532/CA40/005129/2012Apensadoao40/006810/2011-1ºTermoAditivonº52/2012aoContratonº282/2011-Celebradoem08/08/2012"
- ) == []
- assert extract_urls(
- "http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en"
- ) == ["http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en"]
- assert extract_urls("DOI:10.1093/forestry/cpr048") == []
- assert extract_urls("www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228") == [
- "www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228"
- ]
- assert extract_urls("http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"]
- assert extract_urls("hello http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"]
- assert extract_urls("hello http://bit.ly/cJbkv http://bit.ly/cJbkv") == [
- "http://bit.ly/cJbkv", "http://bit.ly/cJbkv"
- ]
- assert extract_urls("jul./set.de") == ["set.de"]
-
-
-def test_extract_doi():
- assert extract_dois("https://doi.org/10.1016/j.jsr.2003.05.009") == ["10.1016/j.jsr.2003.05.009"]
- assert extract_dois("http://dx.doi.org/10.1002/elps.200500338") == ["10.1002/elps.200500338"]
-
- assert extract_dois("!!10.1016/j.chiabu.2013.09.002") == ['10.1016/j.chiabu.2013.09.002']
- assert extract_dois("!!10.1049/joe.2014.0134.!") == ["10.1049/joe.2014.0134"]
- assert extract_dois("!!10.1080/00335630.2012.714899") == ["10.1080/00335630.2012.714899"]
- assert extract_dois("!!10.1177/1075547007306508.!") == ["10.1177/1075547007306508"]
- assert extract_dois("!!445!!10.3390/nu6114822") == ["10.3390/nu6114822"]
- assert extract_dois("!0141-9889,!pp.!448-464!doi:!10.1111/j.1467J9566.2010.01286.!") == [
- "10.1111/j.1467J9566.2010.01286"
- ]
- assert extract_dois("!10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO") == [
- "10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO"
- ]
- assert extract_dois("!10.1002/ajpa.20674.!") == ["10.1002/ajpa.20674"]
- assert extract_dois("!10.1002/chem.201700953.!") == ["10.1002/chem.201700953"]
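
The DOI tests above exercise the removed DOI_PATTERN, a deliberately loose regex: "10", an unescaped dot (so any character), one to six digits, a slash, then a greedy run of non-space characters trimmed back to the last word character, which is what strips the trailing ".!" noise in the cases above. Since the commit drops the Python helpers, here is a hypothetical Go port of the same pattern, for illustration only (not part of this commit):

    package main

    import (
    	"fmt"
    	"regexp"
    )

    // Same loose pattern as the removed Python DOI_PATTERN: "10", any single
    // character, 1-6 digits, "/", then non-space characters ending on a word
    // character.
    var doiPattern = regexp.MustCompile(`10.[0-9]{1,6}/[^ ]*\w`)

    func main() {
    	// Mirrors two of the deleted test cases.
    	fmt.Println(doiPattern.FindAllString("https://doi.org/10.1016/j.jsr.2003.05.009", -1))
    	fmt.Println(doiPattern.FindAllString("!!10.1049/joe.2014.0134.!", -1))
    }
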
diff --git a/skate/set/set.go b/skate/set/set.go
index b762cb8..7e06a1b 100644
--- a/skate/set/set.go
+++ b/skate/set/set.go
@@ -65,9 +65,9 @@ func (s Set) Contains(v string) bool {
// Intersection returns a new set containing all elements found in both sets.
func (s Set) Intersection(t Set) Set {
u := New()
- for _, v := range s.Slice() {
- if t.Contains(v) {
- u.Add(v)
+ for k := range s {
+ if t.Contains(k) {
+ u.Add(k)
}
}
return u
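
The rewritten Intersection ranges over the receiver's map keys directly instead of first materializing them into a slice via s.Slice(), saving one allocation per call. A self-contained sketch of the shape this change assumes; only New, Add, Contains, and Intersection appear in the hunk context, so the map-backed layout and the variadic New are illustrative assumptions:

    package main

    import "fmt"

    // Set is a string set backed by a map, as the hunk above suggests.
    type Set map[string]struct{}

    // New returns a set holding the given values; the variadic form is a guess.
    func New(vs ...string) Set {
    	s := make(Set)
    	for _, v := range vs {
    		s.Add(v)
    	}
    	return s
    }

    // Add inserts v into the set.
    func (s Set) Add(v string) {
    	s[v] = struct{}{}
    }

    // Contains reports whether v is in the set.
    func (s Set) Contains(v string) bool {
    	_, ok := s[v]
    	return ok
    }

    // Intersection returns a new set containing all elements found in both
    // sets, iterating the map directly rather than an intermediate slice.
    func (s Set) Intersection(t Set) Set {
    	u := New()
    	for k := range s {
    		if t.Contains(k) {
    			u.Add(k)
    		}
    	}
    	return u
    }

    func main() {
    	a := New("doi", "pmid", "arxiv")
    	b := New("pmid", "arxiv", "pmcid")
    	fmt.Println(len(a.Intersection(b))) // 2
    }
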