diff options
-rw-r--r-- | extra/elasticsearch/file_schema.json | 1 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 12 |
2 files changed, 13 insertions, 0 deletions
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 9c8ee64c..0fa25c3a 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -44,6 +44,7 @@ "rels": { "type": "keyword", "normalizer": "default" }, "in_ia": { "type": "boolean" }, "in_ia_petabox": { "type": "boolean" }, + "best_url": { "type": "keyword", "normalizer": "default" }, "release_id": { "type": "alias", "path": "release_ids" }, "sha1hex": { "type": "alias", "path": "sha1" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 87e054ec..1d35141b 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -508,4 +508,16 @@ def file_to_elasticsearch(entity): t['in_ia'] = bool('archive.org' in t['domains']) t['in_ia_petabox'] = bool('archive.org' in t['hosts']) + any_url = None + good_url = None + best_url = None + for release_url in (entity.urls or []): + any_url = release_url.url + if release_url.rel in ('webarchive', 'repository'): + good_url = release_url.url + if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: + best_url = release_url.url + # here is where we bake-in priority; IA-specific + t['best_url'] = best_url or good_url or any_url + return t |