From 741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 31 Jan 2020 13:31:59 -0800 Subject: ES releases: host/domain fixes --- python/fatcat_tools/transforms/elasticsearch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'python/fatcat_tools/transforms') diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index b5abe2ae..f8bc05fb 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -502,7 +502,7 @@ def file_to_elasticsearch(entity): ) parsed_urls = [tldextract.extract(u.url) for u in entity.urls] - t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls])) + t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls])) t['domains'] = list(set([pu.registered_domain for pu in parsed_urls])) t['rels'] = list(set([u.rel for u in entity.urls])) @@ -512,6 +512,6 @@ def file_to_elasticsearch(entity): # ok, but actually remove archive.org hosts, because they make other # aggregations hard and are a waste of storage t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')] - t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')] + t['domains'] = [h for h in t['domains'] if h not in ('archive.org')] return t -- cgit v1.2.3