aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/transforms/elasticsearch.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/transforms/elasticsearch.py')
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py6
1 files changed, 6 insertions, 0 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index e1980d90..9aa3cece 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -505,5 +505,11 @@ def file_to_elasticsearch(entity):
t['rels'] = list(set([u.rel for u in entity.urls]))
t['in_ia'] = bool('archive.org' in t['domains'])
+ t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+
+ # ok, but actually remove archive.org hosts, because they make other
+ # aggregations hard and are a waste of storage
+ t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
+ t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
return t