2 files changed, 5 insertions, 2 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index b5abe2ae..f8bc05fb 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -502,7 +502,7 @@ def file_to_elasticsearch(entity):
     )
 
     parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
-    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
     t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
 
@@ -512,6 +512,6 @@ def file_to_elasticsearch(entity):
     # ok, but actually remove archive.org hosts, because they make other
     # aggregations hard and are a waste of storage
     t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
-    t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
+    t['domains'] = [h for h in t['domains'] if h not in ('archive.org')]
 
     return t
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index e67681c6..c94ab375 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -110,6 +110,9 @@ def test_elasticsearch_file_transform(matched_importer):
     assert 'web' in es['rels']
     assert 'www.zhros.ru' in es['hosts']
     assert 'zhros.ru' in es['domains']
+    assert not '.archive.org' in (es['hosts'] + es['domains'])
+    assert not 'archive.org' in (es['hosts'] + es['domains'])
+    assert not 'web.archive.org' in (es['hosts'] + es['domains'])
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)