From c979e8b7e8b6f2761039267ac8dc21cd68610c2f Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 26 Jul 2019 15:24:27 -0700
Subject: chocula: fix domain parsing

---
Review notes (ignored by `git am`; not part of the commit message):
 * parse_url(): assign the result of url.replace(); str is immutable, so the
   bare call was a no-op.
 * parse_url(): build `host` from the named, non-empty parts of the
   tldextract result; '.'.join(tld) produced ".thing.com" when there is no
   subdomain. The unused `domain` local is gone.
 * add_url(): meta['suffix'] is already a str; the stray `.suffix` attribute
   access raised AttributeError on every insert.
 * test_parse_url(): `is None` instead of `== None`, plus a regression
   assert for the no-subdomain host.
 * The number of added/removed lines is unchanged, so the diffstat and hunk
   headers below still hold; the commit id and blob ids refer to the
   pre-review commit.

 chocula.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/chocula.py b/chocula.py
index 1d50e85..4535739 100755
--- a/chocula.py
+++ b/chocula.py
@@ -262,6 +262,47 @@ def test_merge_spans():
         [[1450, 1900], [2000, 2000]]
 
 
+def parse_url(url):
+    """
+    Parses/cleans URLs.
+
+    Returns a dict with:
+
+        url: str, cleaned/normalized URL
+        url_surt: str, "sortable url" (a web-archiving format)
+        host: str, full hostname
+        registered_domain: "primary domain", eg "google.com" or "thing.co.uk"
+        suffix: str, eg "com" or "co.uk"
+
+    Returns None if url is really bad (not a URL).
+    """
+    if not url or 'mailto:' in url.lower() or url in ('http://n/a', 'http://N/A'):
+        return None
+    if url.startswith('www.'):
+        url = "http://" + url
+    url = url.replace('Http://', 'http://')
+    url = str(urlcanon.semantic_precise(url))
+    url_surt = surt.surt(url)
+    tld = tldextract.extract(url)
+    # subdomain is '' for a bare "thing.com"; skip empty parts (no leading '.')
+    host = '.'.join(p for p in (tld.subdomain, tld.domain, tld.suffix) if p)
+    return dict(url=url,
+                url_surt=url_surt,
+                host=host,
+                registered_domain=tld.registered_domain,
+                suffix=tld.suffix)
+
+def test_parse_url():
+    assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk'
+    assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk'
+    assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk'
+    assert parse_url("http://thing.com")['host'] == 'thing.com'
+
+    assert parse_url("mailto:bnewbold@bogus.com") is None
+    assert parse_url("thing.com")['url'] == 'http://thing.com/'
+    assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/'
+
+
 ################### Main Class
 
 class ChoculaDatabase():
@@ -351,19 +392,15 @@ class ChoculaDatabase():
         return issnl, status
 
     def add_url(self, issnl, url):
-        if not (url and issnl) or 'mailto:' in url.lower() or url in ('http://n/a', 'http://N/A'):
+        if not (issnl and url):
+            return
+        meta = parse_url(url)
+        if not meta:
             return
-        if url.startswith('www.'):
-            url = "http://" + url
-        url.replace('Http://', 'http://')
-
-        url = str(urlcanon.semantic_precise(url))
-        url_surt = surt.surt(url)
-        tld = tldextract.extract(url)
-        domain = '.'.join(tld[:])
 
         self.c.execute("INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)",
-            (issnl, url_surt, url, tld.domain, tld.registered_domain, tld.suffix))
+            (issnl, meta['url_surt'], meta['url'], meta['host'],
+             meta['registered_domain'], meta['suffix']))
 
     def index_entrez(self, args):
         path = args.input_file or ENTREZ_FILE
-- 
cgit v1.2.3