about summary refs log tree commit diff stats
path: root/python/fatcat_tools/transforms/elasticsearch.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-06-04 14:01:34 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2020-06-04 14:12:30 -0700
commit a42d5f0d00e76bf8474647fae4e1d9d61693a7d9 (patch)
tree f2556c2e40212da192517d0abd7c4f9e47e82cbb /python/fatcat_tools/transforms/elasticsearch.py
parent 71e5662365892d32a5f92e2733b7ae804c833f57 (diff)
download fatcat-a42d5f0d00e76bf8474647fae4e1d9d61693a7d9.tar.gz
fatcat-a42d5f0d00e76bf8474647fae4e1d9d61693a7d9.zip
ES schema: add best_url to file schema
This will increase index size (URLs are often long in our corpus, and we have many file entities), but it seems worth it. I initially added `ia_url` as a second field, guaranteed to always be an *.archive.org URL, but `best_url` defaults to that anyway, so a separate field didn't seem worthwhile.
Diffstat (limited to 'python/fatcat_tools/transforms/elasticsearch.py')
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py 12
1 files changed, 12 insertions, 0 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 87e054ec..1d35141b 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -508,4 +508,16 @@ def file_to_elasticsearch(entity):
t['in_ia'] = bool('archive.org' in t['domains'])
t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+ any_url = None
+ good_url = None
+ best_url = None
+ for release_url in (entity.urls or []):
+ any_url = release_url.url
+ if release_url.rel in ('webarchive', 'repository'):
+ good_url = release_url.url
+ if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
+ best_url = release_url.url
+ # here is where we bake-in priority; IA-specific
+ t['best_url'] = best_url or good_url or any_url
+
return t