summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-01-30 00:20:34 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-01-30 00:20:34 -0800
commit59912583926077260d99a9bf77a938c2215eb6c8 (patch)
tree8dc9691d43e40d9c188448fd8b683f6c252d2d05
parent4cbee44529dd967c966ed3f2cc2bb80176be4e43 (diff)
downloadfatcat-59912583926077260d99a9bf77a938c2215eb6c8.tar.gz
fatcat-59912583926077260d99a9bf77a938c2215eb6c8.zip
tweak file ES archive.org domain tracking
-rw-r--r--extra/elasticsearch/file_schema.json1
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py6
2 files changed, 7 insertions, 0 deletions
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
index 2a7e5be0..a0ac3346 100644
--- a/extra/elasticsearch/file_schema.json
+++ b/extra/elasticsearch/file_schema.json
@@ -31,6 +31,7 @@
"hosts": { "type": "keyword" },
"rels": { "type": "keyword" },
"in_ia": { "type": "boolean" },
+ "in_ia_petabox": { "type": "boolean" },
"release_id": { "type": "alias", "path": "release_ids" },
"sha1hex": { "type": "alias", "path": "sha1" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index e1980d90..9aa3cece 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -505,5 +505,11 @@ def file_to_elasticsearch(entity):
t['rels'] = list(set([u.rel for u in entity.urls]))
t['in_ia'] = bool('archive.org' in t['domains'])
+ t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+
+ # Now that the in_ia flags above are computed, remove archive.org hosts:
+ # they make other aggregations hard and are a waste of storage.
+ t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
+ t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
return t