author     Bryan Newbold <bnewbold@robocracy.org>   2018-09-21 16:56:01 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>   2018-09-21 16:56:01 -0700
commit     86d15bda26280437ac7a853e73d460d0bf9dd418 (patch)
tree       cfd8347bb1f4e98cdab67cebb4637421458673a9 /extra/elasticsearch
parent     d495df1f76c44b7e09db2fb8b93615ffcdf6b818 (diff)
first pass at a release elastic schema
Diffstat (limited to 'extra/elasticsearch')
-rw-r--r--   extra/elasticsearch/README.md              32
-rw-r--r--   extra/elasticsearch/release_schema.json    60
-rwxr-xr-x   extra/elasticsearch/transform_release.py   82
3 files changed, 174 insertions, 0 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
new file mode 100644
index 00000000..b9800143
--- /dev/null
+++ b/extra/elasticsearch/README.md
@@ -0,0 +1,32 @@
+
+# Elasticsearch Schemas and Pipeline Docs
+
+Eventually, we might end up with schemas for multiple entity types, and in
+particular glom/merge releases under their work, but for now we just have a
+release-oriented schema that pulls in container and files metadata.
+
+Elasticsearch has at least two uses: user-facing search for entities, and
+exploring aggregate numbers.
+
+The schema tries to stay close to the release entity type, but adds some extra
+aggregated fields and flags.
+
+The simple batch update pipeline currently in use is to:
+
+- make a fresh "expanded" release entity dump (JSON)
+- transform using `parallel` and a python script
+- bulk import into elastic using `esbulk`
+
+In the future, it would be nice to have a script that "tails" the changelog for
+edits and updates just those entities in the database. This is somewhat
+non-trivial because the "expand" data requires more sophisticated cache
+invalidation (entity updates), particularly in the case where an inter-entity
+relation is *removed*. For example, if a file match against a given release is
+removed, the old release elastic object needs to be updated to remove the file
+from its `files`.
+
+## TODO
+
+"enum" types, distinct from "keyword"?
+
+Other identifiers in search index? core, wikidata
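The changelog-tailing updater the README sketches in prose does not exist yet. Below is a minimal sketch of the shape it could take, not part of this commit: every fatcat API detail (`get_changelog`, `get_expanded_release`, the changelog entry fields) is a hypothetical placeholder, `es` is assumed to be an elasticsearch-py client, and only `transform` comes from the script added below.

```python
#!/usr/bin/env python3
"""Sketch only: tail the changelog and re-index affected releases.

All fatcat API details here (get_changelog, get_expanded_release, the
changelog entry fields) are hypothetical placeholders, not a real client.
`transform` is the script added in this commit; `es` is assumed to be an
elasticsearch-py client.
"""

import time

from transform_release import transform

def affected_release_idents(entry):
    # The cache-invalidation subtlety from the README: edits to *related*
    # entities must also invalidate release docs, including relations that
    # were removed, so prior file->release matches are enumerated too.
    idents = set(entry.get("release_idents", []))
    for file_edit in entry.get("file_edits", []):
        idents.update(file_edit.get("release_idents", []))
        idents.update(file_edit.get("prev_release_idents", []))
    return idents

def tail_changelog(api, es, since_index):
    while True:
        for entry in api.get_changelog(since=since_index):  # hypothetical
            for ident in affected_release_idents(entry):
                doc = transform(api.get_expanded_release(ident))  # hypothetical
                if doc:
                    es.index(index="fatcat_release", doc_type="work",
                             id=ident, body=doc)
            since_index = entry["index"]
        time.sleep(10)
```

The `prev_release_idents` lookup is the crux: an edit that removes a file-release match leaves no reference to the release in the new file revision, so invalidation has to consult the prior state rather than only the updated entity.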
"container_is_longtail_oa": { "type": "booloean", "include_in_all": false }, + "file_count": { "type": "number", "include_in_all": false }, + "file_pdf_url": { "type": "keyword", "include_in_all": false }, + "file_in_webarchive": { "type": "boolean", "include_in_all": false }, + "file_in_ia": { "type": "boolean", "include_in_all": false }, + "any_abstract": { "type": "boolean", "include_in_all": false }, + "in_shadow": { "type": "boolean", "include_in_all": false } + } + } +} +} diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py new file mode 100755 index 00000000..30449e18 --- /dev/null +++ b/extra/elasticsearch/transform_release.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +import sys +import json + +def transform(m): + + if m['state'] != 'active': + return None + + # First, the easy ones (direct copy) + t = dict( + ident = m['ident'], + revision = m['revision'], + title = m['title'], + release_date = m.get('release_date'), + release_type = m.get('release_type'), + release_status = m.get('release_status'), + language = m.get('language'), + doi = m.get('doi'), + pmid = m.get('pmid'), + pmcid = m.get('pmcid'), + isbn13 = m.get('isbn13'), + core_id = m.get('core_id'), + wikidata_qid = m.get('wikidata_qid') + ) + + container = m.get('container') + if container: + t['publisher'] = countainer.get('publisher') + t['container_title'] = countainer.get('title') + t['container_issnl'] = countainer.get('issnl') + container_extra = container.get('extra') + if container_extra: + t['container_is_oa'] = container_extra.get('is_oa') + t['container_is_kept'] = container_extra.get('is_kept') + t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + else: + t['publisher'] = m.get('publisher') + t['container_title'] = m.get('container_title') + + files = m.get('files', []) + t['file_count'] = len(files) + in_wa = False + in_ia = False + t['file_pdf_url'] = None + for f in files: + is_pdf = 'pdf' in f.get('mimetype', '') + for url in f.get('urls', []): + if url.get('rel', '') == 'webarchive': + in_wa = True + if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: + in_ia = True + if is_pdf: + t['file_pdf_url'] = url['url'] + if not t['file_pdf_url'] and is_pdf: + t['file_pdf_url'] = url['url'] + t['file_in_webarchive'] = in_wa + t['file_in_ia'] = in_ia + + extra = m.get('extra') + if extra: + t['in_shadow'] = extra.get('in_shadow') + t['any_abstract'] = bool(t.get('abstracts')) + + author_names = [] + for contrib in m.get('contribs', []): + if contrib.get('raw_name'): + author_names.append(contrib.get('raw_name')) + return t + +def run(): + for line in sys.stdin: + obj = transform(json.loads(line)) + if obj: + print(json.dumps(obj)) + +if __name__=="__main__": + run() |