diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-26 15:35:25 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-26 15:35:25 -0700 |
commit | ac4db073e0305e864778a306309f073d94524886 (patch) | |
tree | 1d1861052c2bdbea01224781b82dc8b720eac50a /extra/journal_metadata | |
parent | 814cdae01fa0f231792552543904db2888cacc41 (diff) | |
download | fatcat-ac4db073e0305e864778a306309f073d94524886.tar.gz fatcat-ac4db073e0305e864778a306309f073d94524886.zip |
chocula: more host/domain fixes
Diffstat (limited to 'extra/journal_metadata')
-rwxr-xr-x | extra/journal_metadata/chocula.py | 11 |
1 files changed, 8 insertions, 3 deletions
```diff
diff --git a/extra/journal_metadata/chocula.py b/extra/journal_metadata/chocula.py
index 37d4c8b3..50dff49f 100755
--- a/extra/journal_metadata/chocula.py
+++ b/extra/journal_metadata/chocula.py
@@ -270,10 +270,12 @@ def parse_url(url):
     url = str(urlcanon.semantic_precise(url))
     url_surt = surt.surt(url)
     tld = tldextract.extract(url)
-    domain = '.'.join(tld[:])
+    host = '.'.join(tld)
+    if host.startswith('.'):
+        host = host[1:]
     return dict(url=url,
                 url_surt=url_surt,
-                host='.'.join(tld),
+                host=host,
                 registered_domain=tld.registered_domain,
                 suffix=tld.suffix)
 
@@ -283,6 +285,9 @@ def test_parse_url():
     assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk'
     assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk'
 
+    assert parse_url("google.com")['suffix'] == 'com'
+    assert parse_url("google.com")['host'] == 'google.com'
+
     assert parse_url("mailto:bnewbold@bogus.com") == None
     assert parse_url("thing.com")['url'] == 'http://thing.com/'
     assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/'
@@ -385,7 +390,7 @@ class ChoculaDatabase():
 
         self.c.execute("INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)",
             (issnl, meta['url_surt'], meta['url'], meta['host'],
-             meta['registered_domain'], meta['suffix'].suffix))
+             meta['registered_domain'], meta['suffix']))
 
     def index_entrez(self, args):
         path = args.input_file or ENTREZ_FILE
```