author     Bryan Newbold <bnewbold@robocracy.org>    2018-09-22 17:30:21 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>    2018-09-22 17:30:21 -0700
commit     2eedc871a71e83b126f98ca5915a463a8cd50ccc (patch)
tree       4864e3aeca4673ff2bbea53e157d9de853e33e98
parent     526fe297375b8e5efaffdcb936e6d1f0217d5b1a (diff)
download   fatcat-2eedc871a71e83b126f98ca5915a463a8cd50ccc.tar.gz
           fatcat-2eedc871a71e83b126f98ca5915a463a8cd50ccc.zip
update elastic schema and transform
-rw-r--r--   extra/elasticsearch/README.md              |  47
-rw-r--r--   extra/elasticsearch/release_schema.json    |  52
-rwxr-xr-x   extra/elasticsearch/transform_release.py   |  19

3 files changed, 86 insertions, 32 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index b9800143..0d205903 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -25,8 +25,49 @@ relation is *removed*. For example, if a file match against a given release is
removed, the old release elastic object needs to be updated to remove the file
from its `files`.
-## TODO
+## Loading Data
+
+Drop and rebuild the schema:
+
+    http delete :9200/fatcat
+    http put :9200/fatcat < release_schema.json
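+
+For environments without httpie, the same two calls with plain curl (assuming
+Elasticsearch is listening on localhost:9200) would look roughly like:
+
+    curl -XDELETE localhost:9200/fatcat
+    curl -XPUT localhost:9200/fatcat -H 'Content-Type: application/json' --data-binary @release_schema.json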
+
+Put a single object (good for debugging):
+
+    head -n1 examples.json | http post :9200/fatcat/release/0
+    http get :9200/fatcat/release/0
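+
+As a quick sanity check that the schema took effect, the live mapping can be
+fetched back and inspected:
+
+    http get :9200/fatcat/_mapping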
+
+Bulk insert from a file on disk:
+
+    esbulk -verbose -id ident -index fatcat -type release examples.json
-"enum" types, distinct from "keyword"?
+Or, for a production bulk load, stream the full release dump through the transform script:
+
+    time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./transform_release.py | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat-releases -type release
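+
+If esbulk is not available, a small file like examples.json can also be pushed
+through the native `_bulk` API; a rough sketch, using jq to emit an index
+action keyed on `ident` ahead of each document (fine for small batches, not
+for full dumps):
+
+    jq -c '{"index": {"_index": "fatcat", "_type": "release", "_id": .ident}}, .' examples.json \
+        | curl -s -XPOST localhost:9200/_bulk -H 'Content-Type: application/x-ndjson' --data-binary @-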
+
+## Full-Text Querying
+
+A generic full-text "query string" query looks like this (replace "blood" with
+the actual query string, and set "size" to the maximum number of results to return):
+
+    GET /fatcat/release/_search
+    {
+      "query": {
+        "query_string": {
+          "query": "blood",
+          "analyzer": "textIcuSearch",
+          "default_operator": "AND",
+          "analyze_wildcard": true,
+          "lenient": true,
+          "fields": ["title^3", "contrib_names^3", "container_title"]
+        }
+      },
+      "size": 3
+    }
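+
+To actually send this from the shell, the body can be piped in from a file
+(here query.json is just an example filename):
+
+    http post :9200/fatcat/release/_search < query.json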
+
+In the results, take `.hits.hits[]._source` as the objects; `.hits.total` is the
+total number of search hits.
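+
+For example, with jq (continuing the query.json example above):
+
+    http post :9200/fatcat/release/_search < query.json | jq '.hits.total, .hits.hits[]._source.title'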
+
+## TODO
-Other identifiers in search index? core, wikidata
+- file URL domains? seems heavy
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 89359de4..22177c42 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -25,35 +25,43 @@
}
},
"mappings": {
- "work": {
- "_all": { "enabled": true },
+ "release": {
"properties": {
- "ident": { "type": "keyword", "include_in_all": false },
- "revision": { "type": "keyword", "include_in_all": false },
+ "ident": { "type": "keyword" },
+ "revision": { "type": "keyword" },
"title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "author_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "author": { "type": "alias", "path": "contrib_names" },
+ "journal": { "type": "alias", "path": "container_name" },
+ "date": { "type": "alias", "path": "release_date" },
+ "issn": { "type": "alias", "path": "container_issnl" },
+ "oa": { "type": "alias", "path": "container_is_oa" },
+ "kept": { "type": "alias", "path": "container_is_kept" },
+ "longtail": { "type": "alias", "path": "container_is_longtail_oa" },
"release_date": { "type": "date" },
- "release_type": { "type": "keyword", "include_in_all": false },
- "release_status": { "type": "keyword", "include_in_all": false },
- "language": { "type": "keyword", "include_in_all": false },
+ "release_type": { "type": "keyword" },
+ "release_status": { "type": "keyword" },
+ "language": { "type": "keyword" },
"doi": { "type": "keyword" },
"pmid": { "type": "keyword" },
"pmcid": { "type": "keyword" },
"isbn13": { "type": "keyword" },
- "core_id": { "type": "keyword", "include_in_all": false },
- "wikidata_qid": { "type": "keyword", "include_in_all": false },
- "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "include_in_all": false },
- "container_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_issnl": { "type": "keyword", "include_in_all": false },
- "container_is_oa": { "type": "boolean", "include_in_all": false },
- "container_is_kept": { "type": "boolean", "include_in_all": false },
- "container_is_longtail_oa": { "type": "booloean", "include_in_all": false },
- "file_count": { "type": "number", "include_in_all": false },
- "file_pdf_url": { "type": "keyword", "include_in_all": false },
- "file_in_webarchive": { "type": "boolean", "include_in_all": false },
- "file_in_ia": { "type": "boolean", "include_in_all": false },
- "any_abstract": { "type": "boolean", "include_in_all": false },
- "in_shadow": { "type": "boolean", "include_in_all": false }
+ "core_id": { "type": "keyword" },
+ "wikidata_qid": { "type": "keyword" },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "container_issnl": { "type": "keyword" },
+ "container_is_oa": { "type": "boolean" },
+ "container_is_longtail_oa": { "type": "boolean" },
+ "contrib_count": { "type": "integer" },
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "ref_count": { "type": "integer" },
+ "file_count": { "type": "integer" },
+ "file_pdf_url": { "type": "keyword" },
+ "file_in_webarchive": { "type": "boolean" },
+ "file_in_ia": { "type": "boolean" },
+ "any_abstract": { "type": "boolean" },
+ "is_kept": { "type": "boolean" },
+ "in_shadow": { "type": "boolean" }
}
}
}
diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py
index 30449e18..2f67977e 100755
--- a/extra/elasticsearch/transform_release.py
+++ b/extra/elasticsearch/transform_release.py
@@ -26,18 +26,19 @@ def transform(m):
    )

    container = m.get('container')
+    container_is_kept = False
    if container:
        t['publisher'] = container.get('publisher')
-        t['container_title'] = container.get('title')
+        t['container_name'] = container.get('name')
        t['container_issnl'] = container.get('issnl')
        container_extra = container.get('extra')
        if container_extra:
            t['container_is_oa'] = container_extra.get('is_oa')
-            t['container_is_kept'] = container_extra.get('is_kept')
+            container_is_kept = container_extra.get('is_kept', False)
            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
    else:
        t['publisher'] = m.get('publisher')
-        t['container_title'] = m.get('container_title')
+        t['container_name'] = m.get('container_name')

    files = m.get('files', [])
    t['file_count'] = len(files)
@@ -62,11 +63,15 @@ def transform(m):
    if extra:
        t['in_shadow'] = extra.get('in_shadow')
    t['any_abstract'] = bool(t.get('abstracts'))
+    t['is_kept'] = container_is_kept or extra.get('is_kept', False)

-    author_names = []
-    for contrib in m.get('contribs', []):
-        if contrib.get('raw_name'):
-            author_names.append(contrib.get('raw_name'))
+    t['ref_count'] = len(m.get('refs', []))
+    t['contrib_count'] = len(m.get('contribs', []))
+    contrib_names = []
+    for c in m.get('contribs', []):
+        if c.get('raw_name'):
+            contrib_names.append(c.get('raw_name'))
+    t['contrib_names'] = contrib_names
    return t

def run():