summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/transforms
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-01-31 13:31:59 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-01-31 13:31:59 -0800
commit 741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76 (patch)
tree 7c380becdc0b2cd8e1e5b40af76e4f85b3fe0a1c /python/fatcat_tools/transforms
parent 0d037d0d2f73b18014d8d98a06fa3f7bc2c9b794 (diff)
download fatcat-741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76.tar.gz
fatcat-741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76.zip
ES releases: host/domain fixes
Diffstat (limited to 'python/fatcat_tools/transforms')
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py | 4
1 file changed, 2 insertions, 2 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index b5abe2ae..f8bc05fb 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -502,7 +502,7 @@ def file_to_elasticsearch(entity):
)
parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
- t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+ t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
t['rels'] = list(set([u.rel for u in entity.urls]))
@@ -512,6 +512,6 @@ def file_to_elasticsearch(entity):
# ok, but actually remove archive.org hosts, because they make other
# aggregations hard and are a waste of storage
t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
- t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
+ t['domains'] = [h for h in t['domains'] if h not in ('archive.org')]
return t