about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-02-07 14:38:13 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-02-07 14:38:13 -0800
commit 83387210e6775751e5eb690a7d8b56fe99dbe380 (patch)
tree 7fff4ed06981731594f4282f704911dff59c4090
parent 3655bbe6c539fdeccfbfaa19b6fc93a4859e0ca7 (diff)
download fatcat-83387210e6775751e5eb690a7d8b56fe99dbe380.tar.gz
fatcat-83387210e6775751e5eb690a7d8b56fe99dbe380.zip
ES files: don't remove archive.org domains/hosts
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py 5
1 files changed, 0 insertions, 5 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index f8bc05fb..e00d7830 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -509,9 +509,4 @@ def file_to_elasticsearch(entity):
t['in_ia'] = bool('archive.org' in t['domains'])
t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
- # ok, but actually remove archive.org hosts, because they make other
- # aggregations hard and are a waste of storage
- t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
- t['domains'] = [h for h in t['domains'] if h not in ('archive.org')]
-
return t