From 83387210e6775751e5eb690a7d8b56fe99dbe380 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 7 Feb 2020 14:38:13 -0800
Subject: ES files: don't remove archive.org domains/hosts

---
 python/fatcat_tools/transforms/elasticsearch.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index f8bc05fb..e00d7830 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -509,9 +509,4 @@ def file_to_elasticsearch(entity):
     t['in_ia'] = bool('archive.org' in t['domains'])
     t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
 
-    # ok, but actually remove archive.org hosts, because they make other
-    # aggregations hard and are a waste of storage
-    t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
-    t['domains'] = [h for h in t['domains'] if h not in ('archive.org')]
-
     return t
-- 
cgit v1.2.3