Diffstat (limited to 'extra/elasticsearch')

-rw-r--r--  extra/elasticsearch/README.md            | 32
-rw-r--r--  extra/elasticsearch/release_schema.json  | 60
-rwxr-xr-x  extra/elasticsearch/transform_release.py | 81

3 files changed, 173 insertions, 0 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
new file mode 100644
index 00000000..b9800143
--- /dev/null
+++ b/extra/elasticsearch/README.md
@@ -0,0 +1,32 @@
+
+# Elasticsearch Schemas and Pipeline Docs
+
+Eventually, we might end up with schemas for multiple entity types, and in
+particular glom/merge releases under their work, but for now we just have a
+release-oriented schema that pulls in collection and files metadata.
+
+Elasticsearch has at least two uses: user-facing search for entities, and
+exploring aggregate numbers.
+
+The schema tries to stay close to the release entity type, but adds some extra
+aggregated fields and flags.
+
+The simple batch update pipeline currently in use is to:
+
+- make a fresh "expanded" release entity dump (JSON)
+- transform using `parallel` and a python script
+- bulk import into elastic using `esbulk`
+
+In the future, it would be nice to have a script that "tails" the changelog for
+edits and updates just those entities in the search index. This is somewhat
+non-trivial because the "expanded" data requires more sophisticated cache
+invalidation (entity updates), particularly in the case where an inter-entity
+relation is *removed*. For example, if a file match against a given release is
+removed, the old release elastic object needs to be updated to remove the file
+from its `files`.
+
+## TODO
+
+"enum" types, distinct from "keyword"?
+
+Other identifiers in search index? core, wikidata
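
The changelog-tailing idea in the README above might look roughly like the
minimal sketch below. The API base URL, the `/changelog/` path, the
`release_idents` response field, and the `reindex_release()` helper are all
assumptions for illustration, not the actual fatcat API.

```python
#!/usr/bin/env python3
# Hypothetical sketch of "tailing" the changelog and re-indexing only the
# touched releases. Endpoint paths and response fields are assumptions.

import time
import requests

API_BASE = "http://localhost:9411/v0"  # hypothetical API location

def reindex_release(ident):
    # would fetch the expanded release, transform it, and re-index the doc
    pass

def tail_changelog(start_index=1):
    index = start_index
    while True:
        resp = requests.get("{}/changelog/{}".format(API_BASE, index))
        if resp.status_code == 404:
            # assumed convention: 404 means we are caught up; poll again
            time.sleep(30)
            continue
        resp.raise_for_status()
        for ident in resp.json().get("release_idents", []):
            reindex_release(ident)
        index += 1
```

Note this still would not handle the cache invalidation cases described above
(e.g. a removed file match), which require knowing which release objects an
edited entity had previously been expanded into.
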
"include_in_all": false }, +            "wikidata_qid": { "type": "keyword", "include_in_all": false }, +            "publisher":                { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "include_in_all": false }, +            "container_title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "container_issnl":          { "type": "keyword", "include_in_all": false }, +            "container_is_oa":          { "type": "boolean", "include_in_all": false }, +            "container_is_kept":        { "type": "boolean", "include_in_all": false }, +            "container_is_longtail_oa": { "type": "booloean", "include_in_all": false }, +            "file_count":           { "type": "number", "include_in_all": false }, +            "file_pdf_url":         { "type": "keyword", "include_in_all": false }, +            "file_in_webarchive":   { "type": "boolean", "include_in_all": false }, +            "file_in_ia":           { "type": "boolean", "include_in_all": false }, +            "any_abstract":         { "type": "boolean", "include_in_all": false }, +            "in_shadow":            { "type": "boolean", "include_in_all": false } +        } +    } +} +} diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py new file mode 100755 index 00000000..30449e18 --- /dev/null +++ b/extra/elasticsearch/transform_release.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +import sys +import json + +def transform(m): + +    if m['state'] != 'active': +        return None + +    # First, the easy ones (direct copy) +    t = dict( +        ident = m['ident'], +        revision = m['revision'], +        title = m['title'], +        release_date = m.get('release_date'), +        release_type = m.get('release_type'), +        release_status = m.get('release_status'), +        language = m.get('language'), +        doi = m.get('doi'), +        pmid = m.get('pmid'), +        pmcid = m.get('pmcid'), +        isbn13 = m.get('isbn13'), +        core_id = m.get('core_id'), +        wikidata_qid = m.get('wikidata_qid') +    ) + +    container = m.get('container') +    if container: +        t['publisher'] = countainer.get('publisher') +        t['container_title'] = countainer.get('title') +        t['container_issnl'] = countainer.get('issnl') +        container_extra = container.get('extra') +        if container_extra: +            t['container_is_oa'] = container_extra.get('is_oa') +            t['container_is_kept'] = container_extra.get('is_kept') +            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') +    else: +        t['publisher'] = m.get('publisher') +        t['container_title'] = m.get('container_title') + +    files = m.get('files', []) +    t['file_count'] = len(files) +    in_wa = False +    in_ia = False +    t['file_pdf_url'] = None +    for f in files: +        is_pdf = 'pdf' in f.get('mimetype', '') +        for url in f.get('urls', []): +            if url.get('rel', '') == 'webarchive': +                in_wa = True +            if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: +                in_ia = True +                if is_pdf: +                    t['file_pdf_url'] = url['url'] +            if not t['file_pdf_url'] and is_pdf: +                t['file_pdf_url'] = url['url'] +    t['file_in_webarchive'] = in_wa +    t['file_in_ia'] = in_ia + +    extra = m.get('extra') +    if extra: +  
diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py
new file mode 100755
index 00000000..30449e18
--- /dev/null
+++ b/extra/elasticsearch/transform_release.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+
+def transform(m):
+
+    if m['state'] != 'active':
+        return None
+
+    # First, the easy ones (direct copy)
+    t = dict(
+        ident = m['ident'],
+        revision = m['revision'],
+        title = m['title'],
+        release_date = m.get('release_date'),
+        release_type = m.get('release_type'),
+        release_status = m.get('release_status'),
+        language = m.get('language'),
+        doi = m.get('doi'),
+        pmid = m.get('pmid'),
+        pmcid = m.get('pmcid'),
+        isbn13 = m.get('isbn13'),
+        core_id = m.get('core_id'),
+        wikidata_qid = m.get('wikidata_qid')
+    )
+
+    container = m.get('container')
+    if container:
+        t['publisher'] = container.get('publisher')
+        t['container_title'] = container.get('title')
+        t['container_issnl'] = container.get('issnl')
+        container_extra = container.get('extra')
+        if container_extra:
+            t['container_is_oa'] = container_extra.get('is_oa')
+            t['container_is_kept'] = container_extra.get('is_kept')
+            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+    else:
+        t['publisher'] = m.get('publisher')
+        t['container_title'] = m.get('container_title')
+
+    files = m.get('files', [])
+    t['file_count'] = len(files)
+    in_wa = False
+    in_ia = False
+    t['file_pdf_url'] = None
+    for f in files:
+        # mimetype can be missing or null
+        is_pdf = 'pdf' in (f.get('mimetype') or '')
+        for url in f.get('urls', []):
+            if url.get('rel', '') == 'webarchive':
+                in_wa = True
+            if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
+                in_ia = True
+                if is_pdf:
+                    t['file_pdf_url'] = url['url']
+            if not t['file_pdf_url'] and is_pdf:
+                t['file_pdf_url'] = url['url']
+    t['file_in_webarchive'] = in_wa
+    t['file_in_ia'] = in_ia
+
+    extra = m.get('extra')
+    if extra:
+        t['in_shadow'] = extra.get('in_shadow')
+    t['any_abstract'] = bool(m.get('abstracts'))
+
+    author_names = []
+    for contrib in m.get('contribs', []):
+        if contrib.get('raw_name'):
+            author_names.append(contrib['raw_name'])
+    t['author_names'] = author_names
+    return t
+
+def run():
+    for line in sys.stdin:
+        obj = transform(json.loads(line))
+        if obj:
+            print(json.dumps(obj))
+
+if __name__=="__main__":
+    run()
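
To spot-check the transform, one can import it and run a hand-built record
through it (all values below are made up):

```python
import json

from transform_release import transform

sample = {
    "state": "active",
    "ident": "example-ident",
    "revision": "example-revision",
    "title": "An Example Paper",
    "release_type": "article-journal",
    "contribs": [{"raw_name": "Jane Doe"}],
    "files": [],
}
print(json.dumps(transform(sample), indent=2))
```

In the batch pipeline itself, the script instead reads the expanded release
dump line-by-line on stdin (e.g. split across cores with `parallel --pipe`)
and its JSON output is piped into `esbulk` for indexing.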
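
Where `esbulk` is not available, the bulk-import step could also be done from
Python with the client's bulk helpers. A minimal fallback sketch, assuming the
transformed JSON-lines output on stdin and the same example index name as
above:

```python
import json
import sys

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(["localhost:9200"])  # example cluster address

def actions():
    # one transformed release JSON object per input line
    for line in sys.stdin:
        doc = json.loads(line)
        yield {
            "_index": "fatcat_release",  # example index name
            "_type": "work",             # mapping type from the schema
            "_id": doc["ident"],
            "_source": doc,
        }

helpers.bulk(es, actions())
```

Using the release `ident` as the document `_id` makes re-imports idempotent:
re-indexing an updated dump overwrites stale documents instead of duplicating
them.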
