diff options
| -rw-r--r-- | extra/elasticsearch/README.md | 47 | ||||
| -rw-r--r-- | extra/elasticsearch/release_schema.json | 52 | ||||
| -rwxr-xr-x | extra/elasticsearch/transform_release.py | 19 | 
3 files changed, 86 insertions, 32 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index b9800143..0d205903 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -25,8 +25,49 @@ relation is *removed*. For example, if a file match against a given release is  removed, the old release elastic object needs to be updated to remove the file  from its `files`. -## TODO +## Loading Data + +Drop and rebuild the schema: + +    http delete :9200/fatcat +    http put :9200/fatcat < release_schema.json + +Put a single object (good for debugging): + +    head -n1 examples.json | http post :9200/fatcat/release/0 +    http get :9200/fatcat/release/0 + +Bulk insert from a file on disk: + +    esbulk -verbose -id ident -index fatcat -type release examples.json -"enum" types, distinct from "keyword"? +Or, in a bulk production live-stream conversion: + +    time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./transform_release.py | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat-releases -type release + +## Full-Text Querying + +A generic full-text "query string" query looks like this (replace "blood" with +actual query string, and "size" field with the max results to return): + +    GET /fatcat/release/_search +    { +      "query": { +        "query_string": { +          "query": "blood", +          "analyzer": "textIcuSearch", +          "default_operator": "AND", +          "analyze_wildcard": true, +          "lenient": true, +          "fields": ["title^3", "contrib_names^3", "container_name"] +        } +      }, +      "size": 3 +    } + +In the results take `.hits.hits[]._source` as the objects; `.hits.total` is the +total number of search hits. + +## TODO -Other identifiers in search index? core, wikidata +- file URL domains? 
seems heavy diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 89359de4..22177c42 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -25,35 +25,43 @@      }  },  "mappings": { -    "work": { -        "_all": { "enabled": true }, +    "release": {          "properties": { -            "ident":          { "type": "keyword", "include_in_all": false }, -            "revision":       { "type": "keyword", "include_in_all": false }, +            "ident":          { "type": "keyword" }, +            "revision":       { "type": "keyword" },              "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "author_names":   { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "author":         { "type": "alias", "path": "contrib_names" }, +            "journal":        { "type": "alias", "path": "container_name" }, +            "date":           { "type": "alias", "path": "release_date" }, +            "issn":           { "type": "alias", "path": "container_issnl" }, +            "oa":             { "type": "alias", "path": "container_is_oa" }, +            "kept":           { "type": "alias", "path": "container_is_kept" }, +            "longtail":       { "type": "alias", "path": "container_is_longtail_oa" },              "release_date":   { "type": "date" }, -            "release_type":   { "type": "keyword", "include_in_all": false }, -            "release_status": { "type": "keyword", "include_in_all": false }, -            "language": { "type": "keyword", "include_in_all": false }, +            "release_type":   { "type": "keyword" }, +            "release_status": { "type": "keyword" }, +            "language": { "type": "keyword" },              "doi":      { "type": "keyword" },              "pmid":     { "type": "keyword" },              
"pmcid":    { "type": "keyword" },              "isbn13":   { "type": "keyword" }, -            "core_id":      { "type": "keyword", "include_in_all": false }, -            "wikidata_qid": { "type": "keyword", "include_in_all": false }, -            "publisher":                { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "include_in_all": false }, -            "container_title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "container_issnl":          { "type": "keyword", "include_in_all": false }, -            "container_is_oa":          { "type": "boolean", "include_in_all": false }, -            "container_is_kept":        { "type": "boolean", "include_in_all": false }, -            "container_is_longtail_oa": { "type": "booloean", "include_in_all": false }, -            "file_count":           { "type": "number", "include_in_all": false }, -            "file_pdf_url":         { "type": "keyword", "include_in_all": false }, -            "file_in_webarchive":   { "type": "boolean", "include_in_all": false }, -            "file_in_ia":           { "type": "boolean", "include_in_all": false }, -            "any_abstract":         { "type": "boolean", "include_in_all": false }, -            "in_shadow":            { "type": "boolean", "include_in_all": false } +            "core_id":      { "type": "keyword" }, +            "wikidata_qid": { "type": "keyword" }, +            "publisher":                { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "container_name":           { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "container_issnl":          { "type": "keyword" }, +            "container_is_oa":          { "type": "boolean" }, +            "container_is_longtail_oa": { "type": "boolean" }, +            "contrib_count":        { 
"type": "integer" }, +            "contrib_names":  { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "ref_count":            { "type": "integer" }, +            "file_count":           { "type": "integer" }, +            "file_pdf_url":         { "type": "keyword" }, +            "file_in_webarchive":   { "type": "boolean" }, +            "file_in_ia":           { "type": "boolean" }, +            "any_abstract":         { "type": "boolean" }, +            "is_kept":              { "type": "boolean" }, +            "in_shadow":            { "type": "boolean" }          }      }  } diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py index 30449e18..2f67977e 100755 --- a/extra/elasticsearch/transform_release.py +++ b/extra/elasticsearch/transform_release.py @@ -26,18 +26,19 @@ def transform(m):      )      container = m.get('container') +    container_is_kept = False      if container:          t['publisher'] = countainer.get('publisher') -        t['container_title'] = countainer.get('title') +        t['container_name'] = countainer.get('name')          t['container_issnl'] = countainer.get('issnl')          container_extra = container.get('extra')          if container_extra:              t['container_is_oa'] = container_extra.get('is_oa') -            t['container_is_kept'] = container_extra.get('is_kept') +            container_is_kept = container_extra.get('is_kept', False)              t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')      else:          t['publisher'] = m.get('publisher') -        t['container_title'] = m.get('container_title') +        t['container_name'] = m.get('container_name')      files = m.get('files', [])      t['file_count'] = len(files) @@ -62,11 +63,15 @@ def transform(m):      if extra:          t['in_shadow'] = extra.get('in_shadow')      t['any_abstract'] = bool(t.get('abstracts')) +    t['is_kept'] = 
container_is_kept or extra.get('is_kept', False) -    author_names = [] -    for contrib in m.get('contribs', []): -        if contrib.get('raw_name'): -            author_names.append(contrib.get('raw_name')) +    t['ref_count'] = len(m.get('refs', [])) +    t['contrib_count'] = len(m.get('contribs', [])) +    contrib_names = [] +    for c in m.get('contribs', []): +        if c.get('raw_name'): +            contrib_names.append(c.get('raw_name')) +    t['contrib_names'] = contrib_names      return t  def run():  | 
