summary refs log tree commit diff stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-01-30 00:20:34 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-01-30 00:20:34 -0800
commit 59912583926077260d99a9bf77a938c2215eb6c8 (patch)
tree 8dc9691d43e40d9c188448fd8b683f6c252d2d05 /python/fatcat_tools
parent 4cbee44529dd967c966ed3f2cc2bb80176be4e43 (diff)
download fatcat-59912583926077260d99a9bf77a938c2215eb6c8.tar.gz
fatcat-59912583926077260d99a9bf77a938c2215eb6c8.zip
tweak file ES archive.org domain tracking
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py 6
1 files changed, 6 insertions, 0 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index e1980d90..9aa3cece 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -505,5 +505,11 @@ def file_to_elasticsearch(entity):
t['rels'] = list(set([u.rel for u in entity.urls]))
t['in_ia'] = bool('archive.org' in t['domains'])
+ t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+
+ # ok, but actually remove archive.org hosts, because they make other
+ # aggregations hard and are a waste of storage
+ t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
+ t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
return t