diff options
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 6 | 
1 files changed, 6 insertions, 0 deletions
| diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index e1980d90..9aa3cece 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -505,5 +505,11 @@ def file_to_elasticsearch(entity):      t['rels'] = list(set([u.rel for u in entity.urls]))      t['in_ia'] = bool('archive.org' in t['domains']) +    t['in_ia_petabox'] = bool('archive.org' in t['hosts']) + +    # ok, but actually remove archive.org hosts, because they make other +    # aggregations hard and are a waste of storage +    t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')] +    t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]      return t | 
