From 2eedc871a71e83b126f98ca5915a463a8cd50ccc Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 22 Sep 2018 17:30:21 -0700 Subject: update elastic schema and transform --- extra/elasticsearch/README.md | 47 +++++++++++++++++++++++++++-- extra/elasticsearch/release_schema.json | 52 ++++++++++++++++++-------------- extra/elasticsearch/transform_release.py | 19 +++++++----- 3 files changed, 86 insertions(+), 32 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index b9800143..0d205903 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -25,8 +25,49 @@ relation is *removed*. For example, if a file match against a given release is removed, the old release elastic object needs to be updated to remove the file from it's `files`. -## TODO +## Loading Data + +Drop and rebuild the schema: + + http delete :9200/fatcat + http put :9200/fatcat < release_schema.json + +Put a single object (good for debugging): + + head -n1 examples.json | http post :9200/fatcat/release/0 + http get :9200/fatcat/release/0 + +Bulk insert from a file on disk: + + esbulk -verbose -id ident -index fatcat -type release examples.json -"enum" types, distinct from "keyword"? 
+Or, in a bulk production live-stream conversion: + + time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./transform_release.py | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat-releases -type release + +## Full-Text Querying + +A generic full-text "query string" query looks like this (replace "blood" with +actual query string, and "size" field with the max results to return): + + GET /fatcat/release/_search + { + "query": { + "query_string": { + "query": "blood", + "analyzer": "textIcuSearch", + "default_operator": "AND", + "analyze_wildcard": true, + "lenient": true, + "fields": ["title^3", "contrib_names^3", "container_name"] + } + }, + "size": 3 + } + +In the results take `.hits.hits[]._source` as the objects; `.hits.total` is the +total number of search hits. + +## TODO -Other identifiers in search index? core, wikidata +- file URL domains? seems heavy diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 89359de4..22177c42 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -25,35 +25,43 @@ } }, "mappings": { - "work": { - "_all": { "enabled": true }, + "release": { "properties": { - "ident": { "type": "keyword", "include_in_all": false }, - "revision": { "type": "keyword", "include_in_all": false }, + "ident": { "type": "keyword" }, + "revision": { "type": "keyword" }, "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "author_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "author": { "type": "alias", "path": "contrib_names" }, + "journal": { "type": "alias", "path": "container_name" }, + "date": { "type": "alias", "path": "release_date" }, + "issn": { "type": "alias", "path": "container_issnl" }, + "oa": { "type": "alias", "path": "container_is_oa" }, + "kept": { "type": "alias", "path": "container_is_kept" }, + "longtail": { 
"type": "alias", "path": "container_is_longtail_oa" }, "release_date": { "type": "date" }, - "release_type": { "type": "keyword", "include_in_all": false }, - "release_status": { "type": "keyword", "include_in_all": false }, - "language": { "type": "keyword", "include_in_all": false }, + "release_type": { "type": "keyword" }, + "release_status": { "type": "keyword" }, + "language": { "type": "keyword" }, "doi": { "type": "keyword" }, "pmid": { "type": "keyword" }, "pmcid": { "type": "keyword" }, "isbn13": { "type": "keyword" }, - "core_id": { "type": "keyword", "include_in_all": false }, - "wikidata_qid": { "type": "keyword", "include_in_all": false }, - "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "include_in_all": false }, - "container_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_issnl": { "type": "keyword", "include_in_all": false }, - "container_is_oa": { "type": "boolean", "include_in_all": false }, - "container_is_kept": { "type": "boolean", "include_in_all": false }, - "container_is_longtail_oa": { "type": "booloean", "include_in_all": false }, - "file_count": { "type": "number", "include_in_all": false }, - "file_pdf_url": { "type": "keyword", "include_in_all": false }, - "file_in_webarchive": { "type": "boolean", "include_in_all": false }, - "file_in_ia": { "type": "boolean", "include_in_all": false }, - "any_abstract": { "type": "boolean", "include_in_all": false }, - "in_shadow": { "type": "boolean", "include_in_all": false } + "core_id": { "type": "keyword" }, + "wikidata_qid": { "type": "keyword" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "container_issnl": { "type": "keyword" }, + "container_is_oa": { "type": "boolean" }, + 
"container_is_longtail_oa": { "type": "boolean" }, + "contrib_count": { "type": "integer" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "ref_count": { "type": "integer" }, + "file_count": { "type": "integer" }, + "file_pdf_url": { "type": "keyword" }, + "file_in_webarchive": { "type": "boolean" }, + "file_in_ia": { "type": "boolean" }, + "any_abstract": { "type": "boolean" }, + "is_kept": { "type": "boolean" }, + "in_shadow": { "type": "boolean" } } } } diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py index 30449e18..2f67977e 100755 --- a/extra/elasticsearch/transform_release.py +++ b/extra/elasticsearch/transform_release.py @@ -26,18 +26,19 @@ def transform(m): ) container = m.get('container') + container_is_kept = False if container: t['publisher'] = countainer.get('publisher') - t['container_title'] = countainer.get('title') + t['container_name'] = countainer.get('name') t['container_issnl'] = countainer.get('issnl') container_extra = container.get('extra') if container_extra: t['container_is_oa'] = container_extra.get('is_oa') - t['container_is_kept'] = container_extra.get('is_kept') + container_is_kept = container_extra.get('is_kept', False) t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') else: t['publisher'] = m.get('publisher') - t['container_title'] = m.get('container_title') + t['container_name'] = m.get('container_name') files = m.get('files', []) t['file_count'] = len(files) @@ -62,11 +63,15 @@ def transform(m): if extra: t['in_shadow'] = extra.get('in_shadow') t['any_abstract'] = bool(t.get('abstracts')) + t['is_kept'] = container_is_kept or extra.get('is_kept', False) - author_names = [] - for contrib in m.get('contribs', []): - if contrib.get('raw_name'): - author_names.append(contrib.get('raw_name')) + t['ref_count'] = len(m.get('refs', [])) + t['contrib_count'] = len(m.get('contribs', [])) + contrib_names = [] 
+ for c in m.get('contribs', []): + if c.get('raw_name'): + contrib_names.append(c.get('raw_name')) + t['contrib_names'] = contrib_names return t def run(): -- cgit v1.2.3