From 59912583926077260d99a9bf77a938c2215eb6c8 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 30 Jan 2020 00:20:34 -0800
Subject: tweak file ES archive.org domain tracking

---
 python/fatcat_tools/transforms/elasticsearch.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'python')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index e1980d90..9aa3cece 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -505,5 +505,11 @@ def file_to_elasticsearch(entity):
     t['rels'] = list(set([u.rel for u in entity.urls]))
 
     t['in_ia'] = bool('archive.org' in t['domains'])
+    t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+
+    # ok, but actually remove archive.org hosts, because they make other
+    # aggregations hard and are a waste of storage
+    t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
+    t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
 
     return t
-- 
cgit v1.2.3