diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/utils.py | 238 | ||||
-rw-r--r-- | python/tests/test_utils.py | 50 |
2 files changed, 0 insertions, 288 deletions
diff --git a/python/refcat/utils.py b/python/refcat/utils.py index 2fbaec5..a665a25 100644 --- a/python/refcat/utils.py +++ b/python/refcat/utils.py @@ -2,245 +2,7 @@ Assorted utilities. """ -import collections import io -import re - -DOI_PATTERN = re.compile(r"10.[0-9]{1,6}/[^ ]*[\w]") - -# via: https://gist.github.com/gruber/8891611 -URL_PATTERN = re.compile(r"""(?xi) -\b -( # Capture 1: entire matched URL - (?: - https?: # URL protocol and colon - (?: - /{1,3} # 1-3 slashes - | # or - [a-z0-9%] # Single letter or digit or '%' - # (Trying not to match e.g. "URI::Escape") - ) - | # or - # looks like domain name followed by a slash: - [a-z0-9.\-]+[.] - (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj| Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw) - / - ) - (?: # One or more: - [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[] - | # or - \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…) - | - \([^\s]+?\) # balanced parens, non-recursive: (…) - )+ - (?: # End with: - \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…) - | - \([^\s]+?\) # balanced parens, non-recursive: (…) - | # or - [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars - ) - | # OR, the following to match naked domains: - (?: - (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_ - [a-z0-9]+ - (?:[.\-][a-z0-9]+)* - [.] - (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj| Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw) - \b - /? - (?!@) # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com" - ) -) -""") - - -def derive_from_ref(doc): - """ - Given a ref document, return an extended document. - - * biblio might have doi, or url with doi - - Sometimes the doi is not the doi, e.g. - - ... "doi": "https:/www.atsjournals.org/doi/10.1513/AnnalsATS.201602-127OC", - - Do less expensive lookups first: - - * is there a doi? we need a (sorted) doi release ident table for quick - lookups, e.g. shelve - * is there some doi hidden in a url or elsewhere? - * is there a QID? - * derive a number of keys (from fuzzycat) from title and authors - * find the key in a prepared key-value store or do some binary search over a sorted cache file - * run verification between a pseudo-release and a release document - * run verification between a pseudo-release and a release document - - """ - raise NotImplementedError - - -def extract_urls(s): - return URL_PATTERN.findall(s) - - -def extract_dois(s): - return DOI_PATTERN.findall(s) - - -def cleanup_url(url): - """ - Given a URL string, check if it yields a 200. - - http://www.ces.clemson.edu/~ahoover/stare/[14 - http://rsbweb.nih.gov/ij/(accessed - http://www.construction-chanvre.asso.fr. - http://dx - USDA/RM/SARH/INIFAPGeneral - ... - - """ - raise NotImplementedError - - -def ref_to_release(ref, require_title=False): - """ - Turn ref into a release. - - { - "biblio": { - "container_name": "Leisure in Contemporary Society", - "contrib_raw_names": [ - "K Roberts" - ], - "unstructured": "Roberts, K. (1999) Leisure in Contemporary Society. Oxford: CABI Publishing.", - "year": 1999 - }, - "index": 10, - "key": "8_CR11", - "ref_source": "crossref", - "release_ident": "k7jnyix375blxffr4llcc7gfaa", - "release_year": 2007, - "work_ident": "aaaonixgi5b6ziulq4odznf5fa" - } - - Sample (1M) keys: - - { - "has_any_extid": 393409, - "has_container_volume_issue_pages": 58133, - "has_title_contrib_year": 467613, - "has_contrib_container_year": 562049, - "has_title_container_year": 357526, - "total": 1000000, - "has_key": 917486, - "source_grobid": 457224, - "has_release_ident": 1000000, - "has_release_year": 965173, - "has_work_ident": 1000000, - "has_container_name": 590519, - "has_contrib_raw_names": 729729, - "has_issue": 77384, - "has_pages": 339494, - "has_title": 524288, - "has_unstructured": 651102, - "has_volume": 533352, - "has_year": 721823, - "has_index": 969778, - "has_publisher": 40750, - "source_crossref": 425789, - "has_doi": 344885, - "has_locator": 246106, - "has_url": 25284, - "source_datacite": 34855, - "source_pubmed": 66027, - "has_target_release_id": 44076, - "has_pmid": 49144, - "source_fatcat": 16105, - "has_arxiv_id": 2018, - "has_pmcid": 45 - } - - Report from indigo (10M): - - { - "biblio": 10000000, - "biblio.arxiv_id": 23227, - "biblio.container_name": 5760760, - "biblio.contrib_raw_names": 7156385, - "biblio.doi": 3584451, - "biblio.issue": 763784, - "biblio.pages": 3331911, - "biblio.pmcid": 776, - "biblio.pmid": 471338, - "biblio.publisher": 398305, - "biblio.title": 5164864, - "biblio.unstructured": 6304402, - "biblio.url": 256771, - "biblio.volume": 5202508, - "biblio.year": 7055442, - "index": 10000000, - "key": 8986307, - "locator": 2390436, - "ref_source": 10000000, - "release_ident": 10000000, - "release_year": 9629380, - "target_release_id": 419033, - "work_ident": 10000000 - } - - TODO: reduce footprint in the "_" meta section. - - """ - if not "biblio" in ref: - return None - - biblio = ref["biblio"] - result = collections.defaultdict(collections.defaultdict) - - # we need this for clustering - result["ident"] = ref["release_ident"] - - result["_"]["release_ident"] = ref["release_ident"] - result["_"]["work_ident"] = ref["work_ident"] - if "target_release_id" in ref: - # via pmid? - result["_"]["target_release_id"] = ref["target_release_id"] - - if "arxiv_id" in biblio: - result["ext_ids"]["arxiv"] = biblio["arxiv_id"] - if "doi" in biblio: - result["ext_ids"]["doi"] = biblio["doi"] - if "pmid" in biblio: - result["ext_ids"]["pmid"] = biblio["pmid"] - if "pmcid" in biblio: - result["ext_ids"]["pmcid"] = biblio["pmcid"] - - if "title" in biblio: - result["title"] = biblio["title"] - elif require_title: - if "ext_ids" in result: - return result - else: - return None - - if "publisher" in biblio: - result["publisher"] = biblio["publisher"] - if "container_name" in biblio: - result["container"]["name"] = biblio["container_name"] - if "volume" in biblio: - result["volume"] = biblio["volume"] - if "issue" in biblio: - result["issue"] = biblio["issue"] - if "pages" in biblio: - result["pages"] = biblio["pages"] - if "release_year" in biblio: - result["release_year"] = biblio["release_year"] - if "contrib_raw_names" in biblio: - result["contribs"] = [{"raw_name": name} for name in biblio["contrib_raw_names"]] - - return result - def columnize(lines, term_width=80, indent=0, pad=2): n_lines = len(lines) diff --git a/python/tests/test_utils.py b/python/tests/test_utils.py deleted file mode 100644 index acc1888..0000000 --- a/python/tests/test_utils.py +++ /dev/null @@ -1,50 +0,0 @@ -from refcat.utils import extract_urls, extract_dois - - -def test_extract_urls(): - assert extract_urls("") == [] - assert extract_urls("abc") == [] - assert extract_urls("httP//abc") == [] - assert extract_urls("http//a.com") == ["a.com"] - assert extract_urls("http://a.com") == ["http://a.com"] - assert extract_urls("http://a.com/b") == ["http://a.com/b"] - assert extract_urls("https://a.com/b") == ["https://a.com/b"] - assert extract_urls("http=://a.com/b") == ["a.com/"] - assert extract_urls("http://www.bioinformatics.babraham.ac.uk/projects/fastqc/") == [ - "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/" - ] - assert extract_urls( - "CertificaçãoDigitalNº1311532/CA40/005129/2012Apensadoao40/006810/2011-1ºTermoAditivonº52/2012aoContratonº282/2011-Celebradoem08/08/2012" - ) == [] - assert extract_urls( - "http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en" - ) == ["http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en"] - assert extract_urls("DOI:10.1093/forestry/cpr048") == [] - assert extract_urls("www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228") == [ - "www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228" - ] - assert extract_urls("http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"] - assert extract_urls("hello http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"] - assert extract_urls("hello http://bit.ly/cJbkv http://bit.ly/cJbkv") == [ - "http://bit.ly/cJbkv", "http://bit.ly/cJbkv" - ] - assert extract_urls("jul./set.de") == ["set.de"] - - -def test_extract_doi(): - assert extract_dois("https://doi.org/10.1016/j.jsr.2003.05.009") == ["10.1016/j.jsr.2003.05.009"] - assert extract_dois("http://dx.doi.org/10.1002/elps.200500338") == ["10.1002/elps.200500338"] - - assert extract_dois("!!10.1016/j.chiabu.2013.09.002") == ['10.1016/j.chiabu.2013.09.002'] - assert extract_dois("!!10.1049/joe.2014.0134.!") == ["10.1049/joe.2014.0134"] - assert extract_dois("!!10.1080/00335630.2012.714899") == ["10.1080/00335630.2012.714899"] - assert extract_dois("!!10.1177/1075547007306508.!") == ["10.1177/1075547007306508"] - assert extract_dois("!!445!!10.3390/nu6114822") == ["10.3390/nu6114822"] - assert extract_dois("!0141-9889,!pp.!448-464!doi:!10.1111/j.1467J9566.2010.01286.!") == [ - "10.1111/j.1467J9566.2010.01286" - ] - assert extract_dois("!10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO") == [ - "10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO" - ] - assert extract_dois("!10.1002/ajpa.20674.!") == ["10.1002/ajpa.20674"] - assert extract_dois("!10.1002/chem.201700953.!") == ["10.1002/chem.201700953"] |