From 86d15bda26280437ac7a853e73d460d0bf9dd418 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 21 Sep 2018 16:56:01 -0700
Subject: first pass at a release elastic schema

---
 extra/elasticsearch/README.md            | 32 +++++++++++++
 extra/elasticsearch/release_schema.json  | 60 ++++++++++++++++++++++++
 extra/elasticsearch/transform_release.py | 79 ++++++++++++++++++++++++++++++++
 3 files changed, 171 insertions(+)
 create mode 100644 extra/elasticsearch/README.md
 create mode 100644 extra/elasticsearch/release_schema.json
 create mode 100755 extra/elasticsearch/transform_release.py

diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
new file mode 100644
index 00000000..b9800143
--- /dev/null
+++ b/extra/elasticsearch/README.md
@@ -0,0 +1,32 @@

# Elasticsearch Schemas and Pipeline Docs

Eventually we might end up with schemas for multiple entity types, and in
particular glom/merge releases under their works, but for now we just have a
release-oriented schema that pulls in container and file metadata.

Elasticsearch has at least two uses: user-facing search for entities, and
exploring aggregate numbers.

The schema tries to stay close to the release entity type, but adds some extra
aggregated fields and flags.

The simple batch update pipeline currently in use is to:

- make a fresh "expanded" release entity dump (JSON)
- transform using `parallel` and a python script
- bulk import into elastic using `esbulk`

In the future, it would be nice to have a script that "tails" the changelog
for edits and updates just those entities in the search index. This is
somewhat non-trivial because the "expanded" data requires more sophisticated
cache invalidation (entity updates), particularly in the case where an
inter-entity relation is *removed*. For example, if a file match against a
given release is removed, the old release elastic object needs to be updated
to remove the file from its `files`. A rough sketch of such a tailing script
follows after this file.

## TODO

- "enum" types, distinct from "keyword"?
- Other identifiers in the search index? CORE, Wikidata
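As a rough sketch of the changelog-tailing updater described in the README
above: this is purely illustrative, and the API base URL, the `/changelog`
endpoint, and the response fields used here are hypothetical placeholders, not
a real fatcat client. Re-fetching expanded entities, cache invalidation of
removed relations, and deletions are all elided.

```python
# Purely illustrative sketch; the API base URL, the /changelog endpoint, and
# the response fields used below are hypothetical placeholders.

import time
import requests

API = "http://localhost:9411/v0"  # hypothetical API base URL

def tail_changelog(last_index=0):
    """Yield changelog entries as they appear, polling every 10 seconds."""
    while True:
        # hypothetical endpoint: entries with index greater than last_index
        resp = requests.get(API + "/changelog", params={"start": last_index})
        resp.raise_for_status()
        for entry in resp.json():
            last_index = entry["index"]
            yield entry
        time.sleep(10)

if __name__ == "__main__":
    for entry in tail_changelog():
        # for each affected release, one would re-fetch the expanded entity,
        # re-run the transform, and re-index (or delete) the elastic document
        print("would re-index entities touched by changelog entry", entry["index"])
```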
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
new file mode 100644
index 00000000..89359de4
--- /dev/null
+++ b/extra/elasticsearch/release_schema.json
@@ -0,0 +1,60 @@
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "default": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": [ "lowercase", "asciifolding" ]
          },
          "textIcu": {
            "type": "custom",
            "tokenizer": "icu_tokenizer",
            "char_filter": [ "icu_normalizer" ],
            "filter": [ "icu_folding" ]
          },
          "textIcuSearch": {
            "type": "custom",
            "tokenizer": "icu_tokenizer",
            "char_filter": [ "icu_normalizer" ],
            "filter": [ "icu_folding" ]
          }
        }
      }
    }
  },
  "mappings": {
    "release": {
      "_all": { "enabled": true },
      "properties": {
        "ident": { "type": "keyword", "include_in_all": false },
        "revision": { "type": "keyword", "include_in_all": false },
        "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer": "textIcuSearch" },
        "author_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer": "textIcuSearch" },
        "release_date": { "type": "date" },
        "release_type": { "type": "keyword", "include_in_all": false },
        "release_status": { "type": "keyword", "include_in_all": false },
        "language": { "type": "keyword", "include_in_all": false },
        "doi": { "type": "keyword" },
        "pmid": { "type": "keyword" },
        "pmcid": { "type": "keyword" },
        "isbn13": { "type": "keyword" },
        "core_id": { "type": "keyword", "include_in_all": false },
        "wikidata_qid": { "type": "keyword", "include_in_all": false },
        "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer": "textIcuSearch", "include_in_all": false },
        "container_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer": "textIcuSearch" },
        "container_issnl": { "type": "keyword", "include_in_all": false },
        "container_is_oa": { "type": "boolean", "include_in_all": false },
        "container_is_kept": { "type": "boolean", "include_in_all": false },
        "container_is_longtail_oa": { "type": "boolean", "include_in_all": false },
        "file_count": { "type": "integer", "include_in_all": false },
        "file_pdf_url": { "type": "keyword", "include_in_all": false },
        "file_in_webarchive": { "type": "boolean", "include_in_all": false },
        "file_in_ia": { "type": "boolean", "include_in_all": false },
        "any_abstract": { "type": "boolean", "include_in_all": false },
        "in_shadow": { "type": "boolean", "include_in_all": false }
      }
    }
  }
}
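Before the `esbulk` import, the index has to be created with this schema. A
minimal sketch using the official `elasticsearch` Python client; the index
name `fatcat_release` and the local cluster are assumptions, not part of this
commit, and the `textIcu` analyzers additionally require the `analysis-icu`
plugin to be installed on the nodes.

```python
import json
from elasticsearch import Elasticsearch

# the client defaults to a cluster on localhost:9200; adjust for a real
# deployment ("fatcat_release" is an assumed index name, not from this commit)
es = Elasticsearch()

with open("extra/elasticsearch/release_schema.json") as f:
    schema = json.load(f)

# fails with a 400 if the index already exists; delete it first to re-create
es.indices.create(index="fatcat_release", body=schema)
```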
diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py
new file mode 100755
index 00000000..30449e18
--- /dev/null
+++ b/extra/elasticsearch/transform_release.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python3

import sys
import json

def transform(m):

    if m['state'] != 'active':
        return None

    # First, the easy ones (direct copy)
    t = dict(
        ident = m['ident'],
        revision = m['revision'],
        title = m['title'],
        release_date = m.get('release_date'),
        release_type = m.get('release_type'),
        release_status = m.get('release_status'),
        language = m.get('language'),
        doi = m.get('doi'),
        pmid = m.get('pmid'),
        pmcid = m.get('pmcid'),
        isbn13 = m.get('isbn13'),
        core_id = m.get('core_id'),
        wikidata_qid = m.get('wikidata_qid'),
    )

    container = m.get('container')
    if container:
        t['publisher'] = container.get('publisher')
        t['container_title'] = container.get('title')
        t['container_issnl'] = container.get('issnl')
        container_extra = container.get('extra')
        if container_extra:
            t['container_is_oa'] = container_extra.get('is_oa')
            t['container_is_kept'] = container_extra.get('is_kept')
            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
    else:
        t['publisher'] = m.get('publisher')
        t['container_title'] = m.get('container_title')

    files = m.get('files', [])
    t['file_count'] = len(files)
    in_wa = False
    in_ia = False
    t['file_pdf_url'] = None
    for f in files:
        # mimetype can be missing or null; guard before the substring check
        is_pdf = 'pdf' in (f.get('mimetype') or '')
        for url in f.get('urls', []):
            if url.get('rel', '') == 'webarchive':
                in_wa = True
            if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
                in_ia = True
                if is_pdf:
                    # prefer an archive.org URL for the PDF link
                    t['file_pdf_url'] = url['url']
            if not t['file_pdf_url'] and is_pdf:
                # fall back to any URL of a PDF file
                t['file_pdf_url'] = url['url']
    t['file_in_webarchive'] = in_wa
    t['file_in_ia'] = in_ia

    extra = m.get('extra')
    if extra:
        t['in_shadow'] = extra.get('in_shadow')
    t['any_abstract'] = bool(m.get('abstracts'))

    author_names = []
    for contrib in m.get('contribs', []):
        if contrib.get('raw_name'):
            author_names.append(contrib['raw_name'])
    t['author_names'] = author_names
    return t

def run():
    for line in sys.stdin:
        obj = transform(json.loads(line))
        if obj:
            print(json.dumps(obj))

if __name__ == "__main__":
    run()
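And a quick usage example for `transform()` above: a minimal smoke test with a
hand-built release entity. All identifiers and URLs here are made-up example
values, and the test assumes it runs next to `transform_release.py`.

```python
from transform_release import transform

# made-up release entity in the expanded-dump shape transform() expects
example = {
    "state": "active",
    "ident": "hsmo6p4smrganpb3fndaj2lon4",
    "revision": "aaaaaaaa-0000-0000-0000-000000000000",
    "title": "An Example Release",
    "release_type": "article-journal",
    "files": [{
        "mimetype": "application/pdf",
        "urls": [
            {"rel": "webarchive",
             "url": "https://web.archive.org/web/2018/https://example.com/paper.pdf"},
        ],
    }],
    "contribs": [{"raw_name": "Jane Doe"}],
}

t = transform(example)
assert t["file_in_webarchive"] and t["file_in_ia"]
assert t["file_pdf_url"].startswith("https://web.archive.org/")
assert t["author_names"] == ["Jane Doe"]
```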