diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:20:34 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-30 00:20:34 -0800 |
commit | 59912583926077260d99a9bf77a938c2215eb6c8 (patch) | |
tree | 8dc9691d43e40d9c188448fd8b683f6c252d2d05 | |
parent | 4cbee44529dd967c966ed3f2cc2bb80176be4e43 (diff) | |
download | fatcat-59912583926077260d99a9bf77a938c2215eb6c8.tar.gz fatcat-59912583926077260d99a9bf77a938c2215eb6c8.zip |
tweak file ES archive.org domain tracking
-rw-r--r-- | extra/elasticsearch/file_schema.json | 1 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 6 |
2 files changed, 7 insertions, 0 deletions
```diff
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
index 2a7e5be0..a0ac3346 100644
--- a/extra/elasticsearch/file_schema.json
+++ b/extra/elasticsearch/file_schema.json
@@ -31,6 +31,7 @@
             "hosts": { "type": "keyword" },
             "rels": { "type": "keyword" },
             "in_ia": { "type": "boolean" },
+            "in_ia_petabox": { "type": "boolean" },
 
             "release_id": { "type": "alias", "path": "release_ids" },
             "sha1hex": { "type": "alias", "path": "sha1" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index e1980d90..9aa3cece 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -505,5 +505,11 @@ def file_to_elasticsearch(entity):
     t['rels'] = list(set([u.rel for u in entity.urls]))
 
     t['in_ia'] = bool('archive.org' in t['domains'])
+    t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+
+    # ok, but actually remove archive.org hosts, because they make other
+    # aggregations hard and are a waste of storage
+    t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
+    t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
 
     return t
```