author     Bryan Newbold <bnewbold@robocracy.org>    2018-09-22 17:30:21 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>    2018-09-22 17:30:21 -0700
commit     2eedc871a71e83b126f98ca5915a463a8cd50ccc (patch)
tree       4864e3aeca4673ff2bbea53e157d9de853e33e98
parent     526fe297375b8e5efaffdcb936e6d1f0217d5b1a (diff)
download   fatcat-2eedc871a71e83b126f98ca5915a463a8cd50ccc.tar.gz
           fatcat-2eedc871a71e83b126f98ca5915a463a8cd50ccc.zip
update elastic schema and transform
-rw-r--r--   extra/elasticsearch/README.md              |  47
-rw-r--r--   extra/elasticsearch/release_schema.json    |  52
-rwxr-xr-x   extra/elasticsearch/transform_release.py   |  19

3 files changed, 86 insertions, 32 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index b9800143..0d205903 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -25,8 +25,49 @@ relation is *removed*. For example, if a file match against a given release is
removed, the old release elastic object needs to be updated to remove the file
from its `files`.
-## TODO
+## Loading Data
+
+Drop and rebuild the schema:
+
+    http delete :9200/fatcat
+    http put :9200/fatcat < release_schema.json
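+
+For environments without httpie, the same two calls with plain curl (assuming
+Elasticsearch is listening on localhost:9200) would look roughly like:
+
+    curl -XDELETE localhost:9200/fatcat
+    curl -XPUT localhost:9200/fatcat -H 'Content-Type: application/json' --data-binary @release_schema.json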
+
+Put a single object (good for debugging):
+
+    head -n1 examples.json | http post :9200/fatcat/release/0
+    http get :9200/fatcat/release/0
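+
+As a quick sanity check that the schema took effect, the live mapping can be
+fetched back and inspected:
+
+    http get :9200/fatcat/_mapping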
+
+Bulk insert from a file on disk:
+
+    esbulk -verbose -id ident -index fatcat -type release examples.json
-"enum" types, distinct from "keyword"?
+Or, for a production bulk load, stream the full release dump through the transform script:
+
+    time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./transform_release.py | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat-releases -type release
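+
+If esbulk is not available, a small file like examples.json can also be pushed
+through the native `_bulk` API; a rough sketch, using jq to emit an index
+action keyed on `ident` ahead of each document (fine for small batches, not
+for full dumps):
+
+    jq -c '{"index": {"_index": "fatcat", "_type": "release", "_id": .ident}}, .' examples.json \
+        | curl -s -XPOST localhost:9200/_bulk -H 'Content-Type: application/x-ndjson' --data-binary @-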
+
+## Full-Text Querying
+
+A generic full-text "query string" query looks like this (replace "blood" with
+the actual query string, and set "size" to the maximum number of results to return):
+
+    GET /fatcat/release/_search
+    {
+      "query": {
+        "query_string": {
+          "query": "blood",
+          "analyzer": "textIcuSearch",
+          "default_operator": "AND",
+          "analyze_wildcard": true,
+          "lenient": true,
+          "fields": ["title^3", "contrib_names^3", "container_title"]
+        }
+      },
+      "size": 3
+    }
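+
+To actually send this from the shell, the body can be piped in from a file
+(here query.json is just an example filename):
+
+    http post :9200/fatcat/release/_search < query.json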
+
+In the results, take `.hits.hits[]._source` as the objects; `.hits.total` is the
+total number of search hits.
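+
+For example, with jq (continuing the query.json example above):
+
+    http post :9200/fatcat/release/_search < query.json | jq '.hits.total, .hits.hits[]._source.title'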
+
+## TODO
-Other identifiers in search index? core, wikidata
+- file URL domains? seems heavy
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 89359de4..22177c42 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -25,35 +25,43 @@
}
},
"mappings": {
- "work": {
- "_all": { "enabled": true },
+ "release": {
"properties": {
- "ident": { "type": "keyword", "include_in_all": false },
- "revision": { "type": "keyword", "include_in_all": false },
+ "ident": { "type": "keyword" },
+ "revision": { "type": "keyword" },
"title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "author_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "author": { "type": "alias", "path": "contrib_names" },
+ "journal": { "type": "alias", "path": "container_name" },
+ "date": { "type": "alias", "path": "release_date" },
+ "issn": { "type": "alias", "path": "container_issnl" },
+ "oa": { "type": "alias", "path": "container_is_oa" },
+ "kept": { "type": "alias", "path": "container_is_kept" },
+ "longtail": { "type": "alias", "path": "container_is_longtail_oa" },
"release_date": { "type": "date" },
- "release_type": { "type": "keyword", "include_in_all": false },
- "release_status": { "type": "keyword", "include_in_all": false },
- "language": { "type": "keyword", "include_in_all": false },
+ "release_type": { "type": "keyword" },
+ "release_status": { "type": "keyword" },
+ "language": { "type": "keyword" },
"doi": { "type": "keyword" },
"pmid": { "type": "keyword" },
"pmcid": { "type": "keyword" },
"isbn13": { "type": "keyword" },
- "core_id": { "type": "keyword", "include_in_all": false },
- "wikidata_qid": { "type": "keyword", "include_in_all": false },
- "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "include_in_all": false },
- "container_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_issnl": { "type": "keyword", "include_in_all": false },
- "container_is_oa": { "type": "boolean", "include_in_all": false },
- "container_is_kept": { "type": "boolean", "include_in_all": false },
- "container_is_longtail_oa": { "type": "booloean", "include_in_all": false },
- "file_count": { "type": "number", "include_in_all": false },
- "file_pdf_url": { "type": "keyword", "include_in_all": false },
- "file_in_webarchive": { "type": "boolean", "include_in_all": false },
- "file_in_ia": { "type": "boolean", "include_in_all": false },
- "any_abstract": { "type": "boolean", "include_in_all": false },
- "in_shadow": { "type": "boolean", "include_in_all": false }
+ "core_id": { "type": "keyword" },
+ "wikidata_qid": { "type": "keyword" },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "container_issnl": { "type": "keyword" },
+ "container_is_oa": { "type": "boolean" },
+ "container_is_longtail_oa": { "type": "boolean" },
+ "contrib_count": { "type": "integer" },
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "ref_count": { "type": "integer" },
+ "file_count": { "type": "integer" },
+ "file_pdf_url": { "type": "keyword" },
+ "file_in_webarchive": { "type": "boolean" },
+ "file_in_ia": { "type": "boolean" },
+ "any_abstract": { "type": "boolean" },
+ "is_kept": { "type": "boolean" },
+ "in_shadow": { "type": "boolean" }
}
}
}
diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py
index 30449e18..2f67977e 100755
--- a/extra/elasticsearch/transform_release.py
+++ b/extra/elasticsearch/transform_release.py
@@ -26,18 +26,19 @@ def transform(m):
    )

    container = m.get('container')
+    container_is_kept = False
    if container:
        t['publisher'] = container.get('publisher')
-        t['container_title'] = container.get('title')
+        t['container_name'] = container.get('name')
        t['container_issnl'] = container.get('issnl')
        container_extra = container.get('extra')
        if container_extra:
            t['container_is_oa'] = container_extra.get('is_oa')
-            t['container_is_kept'] = container_extra.get('is_kept')
+            container_is_kept = container_extra.get('is_kept', False)
            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
    else:
        t['publisher'] = m.get('publisher')
-        t['container_title'] = m.get('container_title')
+        t['container_name'] = m.get('container_name')

    files = m.get('files', [])
    t['file_count'] = len(files)
@@ -62,11 +63,15 @@ def transform(m):
    if extra:
        t['in_shadow'] = extra.get('in_shadow')
    t['any_abstract'] = bool(t.get('abstracts'))
+    t['is_kept'] = container_is_kept or extra.get('is_kept', False)

-    author_names = []
-    for contrib in m.get('contribs', []):
-        if contrib.get('raw_name'):
-            author_names.append(contrib.get('raw_name'))
+    t['ref_count'] = len(m.get('refs', []))
+    t['contrib_count'] = len(m.get('contribs', []))
+    contrib_names = []
+    for c in m.get('contribs', []):
+        if c.get('raw_name'):
+            contrib_names.append(c.get('raw_name'))
+    t['contrib_names'] = contrib_names
    return t

def run():