about | summary | refs | log | tree | commit | diff | stats
path: root/extra/journal_metadata/chocula.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-07-26 15:35:25 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-07-26 15:35:25 -0700
commit: ac4db073e0305e864778a306309f073d94524886 (patch)
tree: 1d1861052c2bdbea01224781b82dc8b720eac50a /extra/journal_metadata/chocula.py
parent: 814cdae01fa0f231792552543904db2888cacc41 (diff)
download: fatcat-ac4db073e0305e864778a306309f073d94524886.tar.gz
download: fatcat-ac4db073e0305e864778a306309f073d94524886.zip
chocula: more host/domain fixes
Diffstat (limited to 'extra/journal_metadata/chocula.py')
-rwxr-xr-x extra/journal_metadata/chocula.py 11
1 files changed, 8 insertions, 3 deletions
diff --git a/extra/journal_metadata/chocula.py b/extra/journal_metadata/chocula.py
index 37d4c8b3..50dff49f 100755
--- a/extra/journal_metadata/chocula.py
+++ b/extra/journal_metadata/chocula.py
@@ -270,10 +270,12 @@ def parse_url(url):
url = str(urlcanon.semantic_precise(url))
url_surt = surt.surt(url)
tld = tldextract.extract(url)
- domain = '.'.join(tld[:])
+ host = '.'.join(tld)
+ if host.startswith('.'):
+ host = host[1:]
return dict(url=url,
url_surt=url_surt,
- host='.'.join(tld),
+ host=host,
registered_domain=tld.registered_domain,
suffix=tld.suffix)
@@ -283,6 +285,9 @@ def test_parse_url():
assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk'
assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk'
+ assert parse_url("google.com")['suffix'] == 'com'
+ assert parse_url("google.com")['host'] == 'google.com'
+
assert parse_url("mailto:bnewbold@bogus.com") == None
assert parse_url("thing.com")['url'] == 'http://thing.com/'
assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/'
@@ -385,7 +390,7 @@ class ChoculaDatabase():
self.c.execute("INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)",
(issnl, meta['url_surt'], meta['url'], meta['host'],
- meta['registered_domain'], meta['suffix'].suffix))
+ meta['registered_domain'], meta['suffix']))
def index_entrez(self, args):
path = args.input_file or ENTREZ_FILE