diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-26 15:24:27 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-26 15:24:27 -0700 |
commit | cc2cac6d21ce2806b9673e8fa9d9e233c857ff7c (patch) | |
tree | a9c30802e86b4abbb5b4ef59d982f47c46c5c89a | |
parent | ff3464fbd8f270a612acffff1c123b673d81b9bc (diff) | |
download | fatcat-cc2cac6d21ce2806b9673e8fa9d9e233c857ff7c.tar.gz fatcat-cc2cac6d21ce2806b9673e8fa9d9e233c857ff7c.zip |
chocula: fix domain parsing
-rwxr-xr-x | extra/journal_metadata/chocula.py | 57 |
1 files changed, 47 insertions, 10 deletions
```diff
diff --git a/extra/journal_metadata/chocula.py b/extra/journal_metadata/chocula.py
index 1d50e850..45357391 100755
--- a/extra/journal_metadata/chocula.py
+++ b/extra/journal_metadata/chocula.py
@@ -262,6 +262,47 @@ def test_merge_spans():
         [[1450, 1900], [2000, 2000]]
 
 
+def parse_url(url):
+    """
+    Parses/cleans URLs.
+
+    Returns a dict with:
+
+        url: str, cleaned/normalized URL
+        url_surt: str, "sortable url" (a web-archiving format)
+        host: str, full hostname
+        registered_domain: "primary domain", eg "google.com" or "thing.co.uk"
+        suffix: str, eg "com" or "co.uk"
+
+    Returns None if url is really bad (not a URL).
+    """
+    if not url or 'mailto:' in url.lower() or url in ('http://n/a', 'http://N/A'):
+        return None
+    if url.startswith('www.'):
+        url = "http://" + url
+    url.replace('Http://', 'http://')
+
+    url = str(urlcanon.semantic_precise(url))
+    url_surt = surt.surt(url)
+    tld = tldextract.extract(url)
+    domain = '.'.join(tld[:])
+    return dict(url=url,
+        url_surt=url_surt,
+        host='.'.join(tld),
+        registered_domain=tld.registered_domain,
+        suffix=tld.suffix)
+
+def test_parse_url():
+
+    assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk'
+    assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk'
+    assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk'
+
+    assert parse_url("mailto:bnewbold@bogus.com") == None
+    assert parse_url("thing.com")['url'] == 'http://thing.com/'
+    assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/'
+
+
 ################### Main Class
 
 class ChoculaDatabase():
@@ -351,19 +392,15 @@ class ChoculaDatabase():
         return issnl, status
 
     def add_url(self, issnl, url):
-        if not (url and issnl) or 'mailto:' in url.lower() or url in ('http://n/a', 'http://N/A'):
+        if not (issnl and url):
+            return
+        meta = parse_url(url)
+        if not meta:
             return
-        if url.startswith('www.'):
-            url = "http://" + url
-        url.replace('Http://', 'http://')
-
-        url = str(urlcanon.semantic_precise(url))
-        url_surt = surt.surt(url)
-        tld = tldextract.extract(url)
-        domain = '.'.join(tld[:])
 
         self.c.execute("INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)",
-            (issnl, url_surt, url, tld.domain, tld.registered_domain, tld.suffix))
+            (issnl, meta['url_surt'], meta['url'], meta['host'],
+             meta['registered_domain'], meta['suffix'].suffix))
 
     def index_entrez(self, args):
         path = args.input_file or ENTREZ_FILE
```