diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-02-26 22:05:43 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-02-26 22:05:43 -0800 |
commit | ae50ee2274031ddc178fa4a10b59280e8440a24c (patch) | |
tree | 5a17d5d5f875ab6ff770c955b25626c6f36d16a8 /extra/elasticsearch | |
parent | 1556cdd7f0f5f4bc4fe5ccc9764c1598c852bb9b (diff) | |
parent | 81e0784813500a39955c20278140e25d7940d9c6 (diff) | |
download | fatcat-ae50ee2274031ddc178fa4a10b59280e8440a24c.tar.gz fatcat-ae50ee2274031ddc178fa4a10b59280e8440a24c.zip |
Merge branch 'bnewbold-elastic-v03b'
Diffstat (limited to 'extra/elasticsearch')
-rw-r--r-- | extra/elasticsearch/README.md | 7 | ||||
-rw-r--r-- | extra/elasticsearch/changelog_schema.json | 29 | ||||
-rw-r--r-- | extra/elasticsearch/container_schema.json | 68 | ||||
-rw-r--r-- | extra/elasticsearch/file_schema.json | 59 | ||||
-rw-r--r-- | extra/elasticsearch/release_schema.json | 101 |
5 files changed, 203 insertions, 61 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 3a48a178..df4cb918 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -40,9 +40,11 @@ Drop and rebuild the schema: http delete :9200/fatcat_release http delete :9200/fatcat_container + http delete :9200/fatcat_file http delete :9200/fatcat_changelog http put :9200/fatcat_release < release_schema.json http put :9200/fatcat_container < container_schema.json + http put :9200/fatcat_file < file_schema.json http put :9200/fatcat_changelog < changelog_schema.json Put a single object (good for debugging): @@ -57,8 +59,9 @@ Bulk insert from a file on disk: Or, in a bulk production live-stream conversion: export LC_ALL=C.UTF-8 - time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release - time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_container -type container + time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_release -type release + time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_container -type container + time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_file -type file ## Index Aliases diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index f3211e99..d8342549 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -16,20 +28,29 @@ "changelog": { "properties": { "index": { "type": "integer" }, - "editgroup_id": { "type": "keyword" }, + "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "timestamp": { "type": "date" }, - "editor_id": { "type": "keyword" }, - "username": { "type": "keyword" }, + "editor_id": { "type": "keyword", "normalizer": "default" }, + "username": { "type": "keyword", "normalizer": "caseSensitive" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, - "agent": { "type": "keyword" }, + "agent": { "type": "keyword", "normalizer": "caseSensitive" }, + "containers": { "type": "integer" }, + "new_containers": { "type": "integer" }, "creators": { "type": "integer" }, + "new_creators": { "type": "integer" }, "files": { "type": "integer" }, + "new_files": { "type": "integer" }, "filessets": { "type": "integer" }, + "new_filessets": { "type": "integer" }, "webcaptures": { "type": "integer" }, + "new_webcaptures": { "type": "integer" }, "releases": { "type": "integer" }, + "new_releases": { "type": "integer" }, "works": { "type": "integer" }, + "new_works": { "type": "integer" }, + "created": { "type": "integer" }, "updated": { "type": "integer" }, "deleted": { "type": "integer" }, diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index b0a47e85..5cd85b04 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -20,6 +20,18 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -27,43 +39,51 @@ "mappings": { "container": { "properties": { - "ident": { "type": "keyword" }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_type": { "type": "keyword" }, - "issnl": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "country": { "type": "keyword" }, - "region": { "type": "keyword" }, - "discipline": { "type": "keyword" }, - "languages": { "type": "keyword" }, - "mimetypes": { "type": "keyword" }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "issnl": { "type": "keyword", "normalizer": "default" }, + "issns": { "type": "keyword", "normalizer": "default" }, + "wikidata_qid": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "region": { "type": "keyword", "normalizer": "default" }, + "discipline": { "type": "keyword", "normalizer": "default" }, + "languages": { "type": "keyword", "normalizer": "default" }, + "mimetypes": { "type": "keyword", "normalizer": "default" }, "first_year": { "type": "integer" }, "last_year": { "type": "integer" }, - "in_doaj": { "type": "boolean" }, - "in_road": { "type": "boolean" }, - "in_doi": { "type": "boolean" }, - "in_sherpa_romeo":{ "type": "boolean" }, - "is_oa": { "type": "boolean" }, - "is_longtail_oa": { "type": "boolean" }, - "any_kbart": { "type": "boolean" }, - "any_jstor": { "type": "boolean" }, - "any_ia_sim": { "type": "boolean" }, + + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "in_doaj": { "type": "boolean" }, + "in_road": { "type": "boolean" }, + "is_oa": { "type": "boolean" }, + "is_longtail_oa": { "type": "boolean" }, + "any_kbart": { "type": "boolean" }, + "any_jstor": { "type": "boolean" }, + "any_ia_sim": { "type": "boolean" }, + "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" }, "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, "releases_ia": { "type": "integer" }, - "releases_sim": { "type": "integer" }, - "releases_shadow": { "type": "integer" }, + "releases_ia_sim": { "type": "integer" }, + "releases_shadows": { "type": "integer" }, "releases_any_file": { "type": "integer" }, "releases_any_fileset": { "type": "integer" }, "releases_any_webcapture": { "type": "integer" }, "year": { "type": "alias", "path": "first_year" }, "type": { "type": "alias", "path": "container_type" }, + "issn": { "type": "alias", "path": "issns" }, "oa": { "type": "alias", "path": "is_oa" }, "longtail": { "type": "alias", "path": "is_longtail_oa" } } diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json new file mode 100644 index 00000000..9c8ee64c --- /dev/null +++ b/extra/elasticsearch/file_schema.json @@ -0,0 +1,59 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } + } + } + } +}, +"mappings": { + "file": { + "properties": { + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + + "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "release_count": { "type": "integer" }, + "mimetype": { "type": "keyword", "normalizer": "default" }, + "size_bytes": { "type": "integer" }, + "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "md5": { "type": "keyword", "normalizer": "default", "doc_values": false }, + + "domains": { "type": "keyword", "normalizer": "default" }, + "hosts": { "type": "keyword", "normalizer": "default" }, + "rels": { "type": "keyword", "normalizer": "default" }, + "in_ia": { "type": "boolean" }, + "in_ia_petabox": { "type": "boolean" }, + + "release_id": { "type": "alias", "path": "release_ids" }, + "sha1hex": { "type": "alias", "path": "sha1" }, + "sha256hex": { "type": "alias", "path": "sha256" }, + "md5hex": { "type": "alias", "path": "md5" }, + "size": { "type": "alias", "path": "size_bytes" }, + "domain": { "type": "alias", "path": "domains" }, + "host": { "type": "alias", "path": "hosts" }, + "rel": { "type": "alias", "path": "rels" } + } + } +} +} diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 85026060..666a672f 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -20,6 +20,18 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -27,48 +39,66 @@ "mappings": { "release": { "properties": { - "ident": { "type": "keyword" }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "work_id": { "type": "keyword" }, - "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "work_id": { "type": "keyword", "normalizer": "default" }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "release_date": { "type": "date" }, - "release_year": { "type": "integer" }, - "release_type": { "type": "keyword" }, - "release_stage": { "type": "keyword" }, - "withdrawn_status": { "type": "keyword" }, - "language": { "type": "keyword" }, - "doi": { "type": "keyword" }, - "pmid": { "type": "keyword" }, - "pmcid": { "type": "keyword" }, - "isbn13": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "core_id": { "type": "keyword" }, - "axiv_id": { "type": "keyword" }, - "jstor_id": { "type": "keyword" }, - "ark_id": { "type": "keyword" }, - "mag_id": { "type": "keyword" }, - "license": { "type": "keyword" }, + "release_year": { "type": "integer", "copy_to": "biblio" }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "language": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "country_code_upper": { "type": "keyword", "normalizer": "caseSensitive" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "doi": { "type": "keyword", "normalizer": "default" }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license": { "type": "keyword", "normalizer": "default" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_id": { "type": "keyword" }, - "container_issnl": { "type": "keyword" }, - "container_type": { "type": "keyword" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "container_id": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "creator_ids": { "type": "keyword" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "affiliation_rors": { "type": "keyword", "normalizer": "default" }, + "creator_ids": { "type": "keyword", "normalizer": "default" }, "ref_count": { "type": "integer" }, "ref_linked_count": { "type": "integer" }, + "ref_release_ids": { "type": "keyword", "normalizer": "default" }, "file_count": { "type": "integer" }, "fileset_count": { "type": "integer" }, "webcapture_count": { "type": "integer" }, "any_abstract": { "type": "boolean" }, - "best_pdf_url": { "type": "keyword" }, - "ia_pdf_url": { "type": "keyword" }, + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, "is_oa": { "type": "boolean" }, + "oa_color": { "type": "keyword", "normalizer": "default" }, "is_longtail_oa": { "type": "boolean" }, "is_preserved": { "type": "boolean" }, "in_kbart": { "type": "boolean" }, @@ -79,7 +109,13 @@ "in_ia_sim": { "type": "boolean" }, "in_shadows": { "type": "boolean" }, "is_superceded": { "type": "boolean" }, + "is_retracted": { "type": "boolean" }, + "preservation": { "type": "keyword", "normalizer": "default" }, + "affiliation": { "type": "alias", "path": "affiliations" }, + "ror": { "type": "alias", "path": "affiliation_rors" }, + "creator_id": { "type": "alias", "path": "creator_ids" }, + "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, "author": { "type": "alias", "path": "contrib_names" }, "journal": { "type": "alias", "path": "container_name" }, "date": { "type": "alias", "path": "release_date" }, @@ -90,6 +126,9 @@ "lang": { "type": "alias", "path": "language" }, "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, "release_status": { "type": "alias", "path": "release_stage" }, + "stage": { "type": "alias", "path": "release_stage" }, + "type": { "type": "alias", "path": "release_type" }, + "retracted": { "type": "alias", "path": "is_retracted" }, "is_kept": { "type": "alias", "path": "in_kbart" } } } |