aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--extra/elasticsearch/file_schema.json1
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py12
2 files changed, 13 insertions, 0 deletions
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
index 9c8ee64c..0fa25c3a 100644
--- a/extra/elasticsearch/file_schema.json
+++ b/extra/elasticsearch/file_schema.json
@@ -44,6 +44,7 @@
"rels": { "type": "keyword", "normalizer": "default" },
"in_ia": { "type": "boolean" },
"in_ia_petabox": { "type": "boolean" },
+ "best_url": { "type": "keyword", "normalizer": "default" },
"release_id": { "type": "alias", "path": "release_ids" },
"sha1hex": { "type": "alias", "path": "sha1" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 87e054ec..1d35141b 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -508,4 +508,16 @@ def file_to_elasticsearch(entity):
t['in_ia'] = bool('archive.org' in t['domains'])
t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+ any_url = None
+ good_url = None
+ best_url = None
+ for release_url in (entity.urls or []):
+ any_url = release_url.url
+ if release_url.rel in ('webarchive', 'repository'):
+ good_url = release_url.url
+ if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
+ best_url = release_url.url
+ # priority is baked in here (IA-specific): archive.org URL, else 'webarchive'/'repository' rel, else any URL
+ t['best_url'] = best_url or good_url or any_url
+
return t