From a42d5f0d00e76bf8474647fae4e1d9d61693a7d9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 4 Jun 2020 14:01:34 -0700
Subject: ES schema: add best_url to file schema

This will increase index size (URLs are often long in our corpus, and we
have many file entities), but seems worth it.

I initially added `ia_url` as a second field, guaranteed to always be an
*.archive.org URL, but `best_url` defaults to that anyway, so it didn't
seem worthwhile.
---
 python/fatcat_tools/transforms/elasticsearch.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'python')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 87e054ec..1d35141b 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -508,4 +508,16 @@ def file_to_elasticsearch(entity):
     t['in_ia'] = bool('archive.org' in t['domains'])
     t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
 
+    any_url = None
+    good_url = None
+    best_url = None
+    for release_url in (entity.urls or []):
+        any_url = release_url.url
+        if release_url.rel in ('webarchive', 'repository'):
+            good_url = release_url.url
+        if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
+            best_url = release_url.url
+    # bake in priority (IA-specific): archive.org URL > webarchive/repository rel > any URL
+    t['best_url'] = best_url or good_url or any_url
+
     return t
-- 
cgit v1.2.3