author    Bryan Newbold <bnewbold@robocracy.org>  2018-09-21 16:56:01 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2018-09-21 16:56:01 -0700
commit    86d15bda26280437ac7a853e73d460d0bf9dd418
tree      cfd8347bb1f4e98cdab67cebb4637421458673a9
parent    d495df1f76c44b7e09db2fb8b93615ffcdf6b818
first pass at a release elastic schema
Diffstat (limited to 'extra/elasticsearch')

 -rw-r--r--  extra/elasticsearch/README.md             32
 -rw-r--r--  extra/elasticsearch/release_schema.json   60
 -rwxr-xr-x  extra/elasticsearch/transform_release.py  79

 3 files changed, 171 insertions(+), 0 deletions(-)
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
new file mode 100644
index 00000000..b9800143
--- /dev/null
+++ b/extra/elasticsearch/README.md
@@ -0,0 +1,32 @@
+
+# Elasticsearch Schemas and Pipeline Docs
+
+Eventually we might end up with schemas for multiple entity types, and in
+particular might glom/merge releases under their parent works, but for now
+we just have a release-oriented schema that pulls in container and file
+metadata.
+
+Elasticsearch has at least two uses: user-facing search for entities, and
+exploring aggregate numbers.
+
+The schema tries to stay close to the release entity type, but adds some extra
+aggregated fields and flags.
+
+The simple batch update pipeline currently in use, sketched below, is to:
+
+- make a fresh "expanded" release entity dump (JSON)
+- transform using `parallel` and a python script
+- bulk import into elastic using `esbulk`
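+
+Before the bulk import, the index itself needs to be created with the schema
+(the index name and local cluster below are assumptions, not settled config;
+the `icu_tokenizer` analyzers also assume the `analysis-icu` plugin is
+installed):
+
+    curl -XPUT localhost:9200/fatcat_release \
+        -H 'Content-Type: application/json' \
+        -d @release_schema.json
+
+A minimal sketch of the batch steps themselves, where the file names and the
+exact `parallel`/`esbulk` flags are assumptions rather than settled commands:
+
+    zcat release_export_expanded.json.gz \
+        | parallel -j8 --round-robin --pipe ./transform_release.py \
+        > release_objects.json
+    esbulk -verbose -index fatcat_release -type work release_objects.json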
+
+In the future, it would be nice to have a script that "tails" the changelog
+for edits and updates just those entities in the search index. This is
+somewhat non-trivial because the "expanded" data requires more sophisticated
+cache invalidation (entity updates), particularly in the case where an
+inter-entity relation is *removed*. For example, if a file match against a
+given release is removed, the old release elastic object needs to be updated
+to remove the file from its file fields (`file_count`, `file_pdf_url`, etc.).
+
+## TODO
+
+- "enum" types, distinct from "keyword"?
+- other identifiers in the search index? CORE, Wikidata
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
new file mode 100644
index 00000000..89359de4
--- /dev/null
+++ b/extra/elasticsearch/release_schema.json
@@ -0,0 +1,60 @@
+{
+"settings": {
+ "index": {
+ "analysis": {
+ "analyzer": {
+ "default": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [ "lowercase", "asciifolding" ]
+ },
+ "textIcu": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding" ]
+ },
+ "textIcuSearch": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding" ]
+ }
+ }
+ }
+ }
+},
+"mappings": {
+ "work": {
+ "_all": { "enabled": true },
+ "properties": {
+ "ident": { "type": "keyword", "include_in_all": false },
+ "revision": { "type": "keyword", "include_in_all": false },
+ "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "author_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "release_date": { "type": "date" },
+ "release_type": { "type": "keyword", "include_in_all": false },
+ "release_status": { "type": "keyword", "include_in_all": false },
+ "language": { "type": "keyword", "include_in_all": false },
+ "doi": { "type": "keyword" },
+ "pmid": { "type": "keyword" },
+ "pmcid": { "type": "keyword" },
+ "isbn13": { "type": "keyword" },
+ "core_id": { "type": "keyword", "include_in_all": false },
+ "wikidata_qid": { "type": "keyword", "include_in_all": false },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "include_in_all": false },
+ "container_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "container_issnl": { "type": "keyword", "include_in_all": false },
+ "container_is_oa": { "type": "boolean", "include_in_all": false },
+ "container_is_kept": { "type": "boolean", "include_in_all": false },
+    "container_is_longtail_oa": { "type": "boolean", "include_in_all": false },
+    "file_count": { "type": "integer", "include_in_all": false },
+ "file_pdf_url": { "type": "keyword", "include_in_all": false },
+ "file_in_webarchive": { "type": "boolean", "include_in_all": false },
+ "file_in_ia": { "type": "boolean", "include_in_all": false },
+ "any_abstract": { "type": "boolean", "include_in_all": false },
+ "in_shadow": { "type": "boolean", "include_in_all": false }
+ }
+ }
+}
+}
diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py
new file mode 100755
index 00000000..30449e18
--- /dev/null
+++ b/extra/elasticsearch/transform_release.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
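+"""
+Read an "expanded" release entity dump (one JSON object per line) from
+stdin and write flattened objects matching release_schema.json to stdout,
+one per line, ready for bulk import (e.g. with esbulk).
+"""
+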
+import sys
+import json
+
+def transform(m):
+
+ if m['state'] != 'active':
+ return None
+
+ # First, the easy ones (direct copy)
+ t = dict(
+ ident = m['ident'],
+ revision = m['revision'],
+ title = m['title'],
+ release_date = m.get('release_date'),
+ release_type = m.get('release_type'),
+ release_status = m.get('release_status'),
+ language = m.get('language'),
+ doi = m.get('doi'),
+ pmid = m.get('pmid'),
+ pmcid = m.get('pmcid'),
+ isbn13 = m.get('isbn13'),
+ core_id = m.get('core_id'),
+ wikidata_qid = m.get('wikidata_qid')
+ )
+
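+    # container (journal) metadata, falling back to release-level fields
+    # when no container entity is linked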
+ container = m.get('container')
+ if container:
+        t['publisher'] = container.get('publisher')
+        t['container_title'] = container.get('title')
+        t['container_issnl'] = container.get('issnl')
+ container_extra = container.get('extra')
+ if container_extra:
+ t['container_is_oa'] = container_extra.get('is_oa')
+ t['container_is_kept'] = container_extra.get('is_kept')
+ t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+ else:
+ t['publisher'] = m.get('publisher')
+ t['container_title'] = m.get('container_title')
+
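+    # file counts, a preferred PDF URL, and web-archive coverage flags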
+ files = m.get('files', [])
+ t['file_count'] = len(files)
+ in_wa = False
+ in_ia = False
+ t['file_pdf_url'] = None
+ for f in files:
+        # mimetype can be missing or null; treat both as "not a PDF"
+        is_pdf = 'pdf' in (f.get('mimetype') or '')
+ for url in f.get('urls', []):
+ if url.get('rel', '') == 'webarchive':
+ in_wa = True
+ if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
+ in_ia = True
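+                # prefer an archive.org copy as the canonical PDF URL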
+ if is_pdf:
+ t['file_pdf_url'] = url['url']
+        # otherwise fall back to the file's last URL, if any
+        if not t['file_pdf_url'] and is_pdf and f.get('urls'):
+            t['file_pdf_url'] = f['urls'][-1]['url']
+ t['file_in_webarchive'] = in_wa
+ t['file_in_ia'] = in_ia
+
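+    # abstract-presence and shadow-library flags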
+ extra = m.get('extra')
+ if extra:
+ t['in_shadow'] = extra.get('in_shadow')
+    t['any_abstract'] = bool(m.get('abstracts'))
+
+    author_names = []
+    for contrib in m.get('contribs', []):
+        if contrib.get('raw_name'):
+            author_names.append(contrib['raw_name'])
+    t['author_names'] = author_names
+    return t
+
+def run():
+ for line in sys.stdin:
+ obj = transform(json.loads(line))
+ if obj:
+ print(json.dumps(obj))
+
+if __name__ == "__main__":
+ run()