diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-06-04 14:01:34 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-06-04 14:12:30 -0700 | 
| commit | a42d5f0d00e76bf8474647fae4e1d9d61693a7d9 (patch) | |
| tree | f2556c2e40212da192517d0abd7c4f9e47e82cbb | |
| parent | 71e5662365892d32a5f92e2733b7ae804c833f57 (diff) | |
| download | fatcat-a42d5f0d00e76bf8474647fae4e1d9d61693a7d9.tar.gz fatcat-a42d5f0d00e76bf8474647fae4e1d9d61693a7d9.zip | |
ES schema: add best_url to file schema
This will increase index size (URLs are often long in our corpus, and we
have many file entities), but it seems worth it.
I initially added `ia_url` as a second field, guaranteed to always be an
*.archive.org URL, but `best_url` already prefers an archive.org URL
whenever one exists, so a separate field didn't seem worthwhile.
| -rw-r--r-- | extra/elasticsearch/file_schema.json | 1 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 12 | 
2 files changed, 13 insertions, 0 deletions
| diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 9c8ee64c..0fa25c3a 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -44,6 +44,7 @@              "rels":             { "type": "keyword", "normalizer": "default" },              "in_ia":            { "type": "boolean" },              "in_ia_petabox":    { "type": "boolean" }, +            "best_url":         { "type": "keyword", "normalizer": "default" },              "release_id":       { "type": "alias", "path": "release_ids" },              "sha1hex":          { "type": "alias", "path": "sha1" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 87e054ec..1d35141b 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -508,4 +508,16 @@ def file_to_elasticsearch(entity):      t['in_ia'] = bool('archive.org' in t['domains'])      t['in_ia_petabox'] = bool('archive.org' in t['hosts']) +    any_url = None +    good_url = None +    best_url = None +    for release_url in (entity.urls or []): +        any_url = release_url.url +        if release_url.rel in ('webarchive', 'repository'): +            good_url = release_url.url +        if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: +            best_url = release_url.url +    # here is where we bake-in priority; IA-specific +    t['best_url'] = best_url or good_url or any_url +      return t | 
