diff options
Diffstat (limited to 'extra/elasticsearch')
| -rw-r--r-- | extra/elasticsearch/README.md | 7 | ||||
| -rw-r--r-- | extra/elasticsearch/changelog_schema.json | 29 | ||||
| -rw-r--r-- | extra/elasticsearch/container_schema.json | 68 | ||||
| -rw-r--r-- | extra/elasticsearch/file_schema.json | 59 | ||||
| -rw-r--r-- | extra/elasticsearch/release_schema.json | 101 | 
5 files changed, 203 insertions, 61 deletions
| diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 3a48a178..17865bc0 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -40,9 +40,11 @@ Drop and rebuild the schema:      http delete :9200/fatcat_release      http delete :9200/fatcat_container +    http delete :9200/fatcat_file      http delete :9200/fatcat_changelog      http put :9200/fatcat_release < release_schema.json      http put :9200/fatcat_container < container_schema.json +    http put :9200/fatcat_file < file_schema.json      http put :9200/fatcat_changelog < changelog_schema.json  Put a single object (good for debugging): @@ -57,8 +59,9 @@ Bulk insert from a file on disk:  Or, in a bulk production live-stream conversion:      export LC_ALL=C.UTF-8 -    time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release -    time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_container -type container +    time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release -type release +    time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container -type container +    time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file -type file  ## Index Aliases diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index f3211e99..d8342549 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -8,6 +8,18 @@                      "tokenizer": "standard",                      "filter": [ "lowercase", "asciifolding" ]                  } +            }, +            "normalizer": { +                "default": { +                    "type": "custom", +                    "char_filter": [], +                    "filter": ["lowercase"] +                }, +                "caseSensitive": { +                    "type": "custom", +                    "char_filter": [], +                    "filter": [] +                }              }          }      } @@ -16,20 +28,29 @@      "changelog": {          "properties": {              "index":            { "type": "integer" }, -            "editgroup_id":     { "type": "keyword" }, +            "editgroup_id":     { "type": "keyword", "normalizer": "default", "doc_values": false },              "timestamp":        { "type": "date" }, -            "editor_id":        { "type": "keyword" }, -            "username":         { "type": "keyword" }, +            "editor_id":        { "type": "keyword", "normalizer": "default" }, +            "username":         { "type": "keyword", "normalizer": "caseSensitive" },              "is_bot":           { "type": "boolean" },              "is_admin":         { "type": "boolean" }, -            "agent":            { "type": "keyword" }, +            "agent":            { "type": "keyword", "normalizer": "caseSensitive" }, +              "containers":       { "type": "integer" }, +            "new_containers":   { "type": "integer" },              "creators":         { "type": "integer" }, +            "new_creators":     { "type": "integer" },              "files":            { "type": "integer" }, +            "new_files":        { "type": "integer" },              "filessets":        { "type": "integer" }, +            "new_filessets":    { "type": "integer" },              "webcaptures":      { "type": "integer" }, +            "new_webcaptures":  { "type": "integer" },              "releases":         { "type": "integer" }, +            "new_releases":     { "type": "integer" },              "works":            { "type": "integer" }, +            "new_works":        { "type": "integer" }, +              "created":          { "type": "integer" },              "updated":          { "type": "integer" },              "deleted":          { "type": "integer" }, diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index b0a47e85..5cd85b04 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -20,6 +20,18 @@                      "char_filter": [ "icu_normalizer" ],                      "filter": [ "icu_folding" ]                  } +            }, +            "normalizer": { +                "default": { +                    "type": "custom", +                    "char_filter": [], +                    "filter": ["lowercase"] +                }, +                "caseSensitive": { +                    "type": "custom", +                    "char_filter": [], +                    "filter": [] +                }              }          }      } @@ -27,43 +39,51 @@  "mappings": {      "container": {          "properties": { -            "ident":          { "type": "keyword" }, -            "state":          { "type": "keyword" }, -            "revision":       { "type": "keyword" }, -            "name":           { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "publisher":      { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "container_type": { "type": "keyword" }, -            "issnl":          { "type": "keyword" }, -            "wikidata_qid":   { "type": "keyword" }, -            "country":        { "type": "keyword" }, -            "region":         { "type": "keyword" }, -            "discipline":     { "type": "keyword" }, -            "languages":      { "type": "keyword" }, -            "mimetypes":      { "type": "keyword" }, +            "ident":          { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "state":          { "type": "keyword", "normalizer": "default" }, +            "revision":       { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "name":           { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, +            "original_name":  { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, +            "publisher":      { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, +            "abbrev":         { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, +            "aliases":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, +            "publisher_type": { "type": "keyword", "normalizer": "default" }, +            "container_type": { "type": "keyword", "normalizer": "default" }, +            "issnl":          { "type": "keyword", "normalizer": "default" }, +            "issns":          { "type": "keyword", "normalizer": "default" }, +            "wikidata_qid":   { "type": "keyword", "normalizer": "default" }, +            "country_code":   { "type": "keyword", "normalizer": "default" }, +            "region":         { "type": "keyword", "normalizer": "default" }, +            "discipline":     { "type": "keyword", "normalizer": "default" }, +            "languages":      { "type": "keyword", "normalizer": "default" }, +            "mimetypes":      { "type": "keyword", "normalizer": "default" },              "first_year":     { "type": "integer" },              "last_year":      { "type": "integer" }, -            "in_doaj":        { "type": "boolean" }, -            "in_road":        { "type": "boolean" }, -            "in_doi":         { "type": "boolean" }, -            "in_sherpa_romeo":{ "type": "boolean" }, -            "is_oa":          { "type": "boolean" }, -            "is_longtail_oa": { "type": "boolean" }, -            "any_kbart":      { "type": "boolean" }, -            "any_jstor":      { "type": "boolean" }, -            "any_ia_sim":        { "type": "boolean" }, + +            "biblio":         { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + +            "in_doaj":              { "type": "boolean" }, +            "in_road":              { "type": "boolean" }, +            "is_oa":                { "type": "boolean" }, +            "is_longtail_oa":       { "type": "boolean" }, +            "any_kbart":            { "type": "boolean" }, +            "any_jstor":            { "type": "boolean" }, +            "any_ia_sim":           { "type": "boolean" }, +            "sherpa_romeo_color":   { "type": "keyword", "normalizer": "default" },              "releases_total": { "type": "integer" },              "releases_kbart": { "type": "integer" },              "releases_ia":    { "type": "integer" }, -            "releases_sim":   { "type": "integer" }, -            "releases_shadow":          { "type": "integer" }, +            "releases_ia_sim":          { "type": "integer" }, +            "releases_shadows":         { "type": "integer" },              "releases_any_file":        { "type": "integer" },              "releases_any_fileset":     { "type": "integer" },              "releases_any_webcapture":  { "type": "integer" },              "year":           { "type": "alias", "path": "first_year" },              "type":           { "type": "alias", "path": "container_type" }, +            "issn":           { "type": "alias", "path": "issns" },              "oa":             { "type": "alias", "path": "is_oa" },              "longtail":       { "type": "alias", "path": "is_longtail_oa" }          } diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json new file mode 100644 index 00000000..9c8ee64c --- /dev/null +++ b/extra/elasticsearch/file_schema.json @@ -0,0 +1,59 @@ +{ +"settings": { +    "index": { +        "analysis": { +            "analyzer": { +                "default": { +                    "type": "custom", +                    "tokenizer": "standard", +                    "filter": [ "lowercase", "asciifolding" ] +                } +            }, +            "normalizer": { +                "default": { +                    "type": "custom", +                    "char_filter": [], +                    "filter": ["lowercase"] +                }, +                "caseSensitive": { +                    "type": "custom", +                    "char_filter": [], +                    "filter": [] +                } +            } +        } +    } +}, +"mappings": { +    "file": { +        "properties": { +            "ident":            { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "state":            { "type": "keyword", "normalizer": "default" }, +            "revision":         { "type": "keyword", "normalizer": "default", "doc_values": false }, + +            "release_ids":      { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "release_count":    { "type": "integer" }, +            "mimetype":         { "type": "keyword", "normalizer": "default" }, +            "size_bytes":       { "type": "integer" }, +            "sha1":             { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "sha256":           { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "md5":              { "type": "keyword", "normalizer": "default", "doc_values": false }, + +            "domains":          { "type": "keyword", "normalizer": "default" }, +            "hosts":            { "type": "keyword", "normalizer": "default" }, +            "rels":             { "type": "keyword", "normalizer": "default" }, +            "in_ia":            { "type": "boolean" }, +            "in_ia_petabox":    { "type": "boolean" }, + +            "release_id":       { "type": "alias", "path": "release_ids" }, +            "sha1hex":          { "type": "alias", "path": "sha1" }, +            "sha256hex":        { "type": "alias", "path": "sha256" }, +            "md5hex":           { "type": "alias", "path": "md5" }, +            "size":             { "type": "alias", "path": "size_bytes" }, +            "domain":           { "type": "alias", "path": "domains" }, +            "host":             { "type": "alias", "path": "hosts" }, +            "rel":              { "type": "alias", "path": "rels" } +        } +    } +} +} diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 85026060..666a672f 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -20,6 +20,18 @@                      "char_filter": [ "icu_normalizer" ],                      "filter": [ "icu_folding" ]                  } +            }, +            "normalizer": { +                "default": { +                    "type": "custom", +                    "char_filter": [], +                    "filter": ["lowercase"] +                }, +                "caseSensitive": { +                    "type": "custom", +                    "char_filter": [], +                    "filter": [] +                }              }          }      } @@ -27,48 +39,66 @@  "mappings": {      "release": {          "properties": { -            "ident":          { "type": "keyword" }, -            "state":          { "type": "keyword" }, -            "revision":       { "type": "keyword" }, -            "work_id":        { "type": "keyword" }, -            "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "subtitle":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "ident":          { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "state":          { "type": "keyword", "normalizer": "default" }, +            "revision":       { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "work_id":        { "type": "keyword", "normalizer": "default" }, +            "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, +            "subtitle":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, +            "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },              "release_date":   { "type": "date" }, -            "release_year":   { "type": "integer" }, -            "release_type":   { "type": "keyword" }, -            "release_stage":  { "type": "keyword" }, -            "withdrawn_status": { "type": "keyword" }, -            "language":       { "type": "keyword" }, -            "doi":            { "type": "keyword" }, -            "pmid":           { "type": "keyword" }, -            "pmcid":          { "type": "keyword" }, -            "isbn13":         { "type": "keyword" }, -            "wikidata_qid":   { "type": "keyword" }, -            "core_id":        { "type": "keyword" }, -            "axiv_id":        { "type": "keyword" }, -            "jstor_id":       { "type": "keyword" }, -            "ark_id":         { "type": "keyword" }, -            "mag_id":         { "type": "keyword" }, -            "license":        { "type": "keyword" }, +            "release_year":   { "type": "integer", "copy_to": "biblio" }, +            "release_type":   { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, +            "release_stage":  { "type": "keyword", "normalizer": "default" }, +            "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, +            "language":       { "type": "keyword", "normalizer": "default" }, +            "country_code":        { "type": "keyword", "normalizer": "default" }, +            "country_code_upper":  { "type": "keyword", "normalizer": "caseSensitive" }, +            "volume":         { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, +            "issue":          { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, +            "pages":          { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, +            "first_page":     { "type": "keyword", "normalizer": "default" }, +            "number":         { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, +            "doi":            { "type": "keyword", "normalizer": "default" }, +            "doi_prefix":     { "type": "keyword", "normalizer": "default" }, +            "doi_registrar":  { "type": "keyword", "normalizer": "default" }, +            "pmid":           { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "pmcid":          { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "isbn13":         { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "wikidata_qid":   { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "core_id":        { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "arxiv_id":       { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "jstor_id":       { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "ark_id":         { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "mag_id":         { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "s2_id":          { "type": "keyword", "normalizer": "default", "doc_values": false }, +            "license":        { "type": "keyword", "normalizer": "default" },              "publisher":            { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "container_name":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "container_id":         { "type": "keyword" }, -            "container_issnl":      { "type": "keyword" }, -            "container_type":       { "type": "keyword" }, +            "publisher_type":       { "type": "keyword", "normalizer": "default" }, +            "container_name":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, +            "container_id":         { "type": "keyword", "normalizer": "default" }, +            "container_issnl":      { "type": "keyword", "normalizer": "default" }, +            "container_type":       { "type": "keyword", "normalizer": "default" },              "contrib_count":        { "type": "integer" }, -            "contrib_names":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "creator_ids":          { "type": "keyword" }, +            "contrib_names":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, +            "affiliations":         { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "affiliation_rors":     { "type": "keyword", "normalizer": "default" }, +            "creator_ids":          { "type": "keyword", "normalizer": "default" },              "ref_count":            { "type": "integer" },              "ref_linked_count":     { "type": "integer" }, +            "ref_release_ids":      { "type": "keyword", "normalizer": "default" },              "file_count":           { "type": "integer" },              "fileset_count":        { "type": "integer" },              "webcapture_count":     { "type": "integer" },              "any_abstract":         { "type": "boolean" }, -            "best_pdf_url":         { "type": "keyword" }, -            "ia_pdf_url":           { "type": "keyword" }, +            "biblio":               { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + +            "best_pdf_url":         { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, +            "ia_pdf_url":           { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, +            "ia_microfilm_url":     { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },              "is_oa":                { "type": "boolean" }, +            "oa_color":             { "type": "keyword", "normalizer": "default" },              "is_longtail_oa":       { "type": "boolean" },              "is_preserved":         { "type": "boolean" },              "in_kbart":             { "type": "boolean" }, @@ -79,7 +109,13 @@              "in_ia_sim":            { "type": "boolean" },              "in_shadows":           { "type": "boolean" },              "is_superceded":        { "type": "boolean" }, +            "is_retracted":         { "type": "boolean" }, +            "preservation":         { "type": "keyword", "normalizer": "default" }, +            "affiliation":    { "type": "alias", "path": "affiliations" }, +            "ror":            { "type": "alias", "path": "affiliation_rors" }, +            "creator_id":     { "type": "alias", "path": "creator_ids" }, +            "ref_release_id": { "type": "alias", "path": "ref_release_ids" },              "author":         { "type": "alias", "path": "contrib_names" },              "journal":        { "type": "alias", "path": "container_name" },              "date":           { "type": "alias", "path": "release_date" }, @@ -90,6 +126,9 @@              "lang":           { "type": "alias", "path": "language" },              "file_pdf_url":   { "type": "alias", "path": "best_pdf_url" },              "release_status": { "type": "alias", "path": "release_stage" }, +            "stage":          { "type": "alias", "path": "release_stage" }, +            "type":           { "type": "alias", "path": "release_type" }, +            "retracted":      { "type": "alias", "path": "is_retracted" },              "is_kept":        { "type": "alias", "path": "in_kbart" }          }      } | 
