diff options
Diffstat (limited to 'extra')
-rw-r--r-- | extra/elasticsearch/README.md | 7 | ||||
-rw-r--r-- | extra/elasticsearch/changelog_schema.json | 29 | ||||
-rw-r--r-- | extra/elasticsearch/container_schema.json | 68 | ||||
-rw-r--r-- | extra/elasticsearch/file_schema.json | 59 | ||||
-rw-r--r-- | extra/elasticsearch/release_schema.json | 101 | ||||
-rw-r--r-- | extra/stats/2020-02-19-prod-stats.json | 1 | ||||
-rw-r--r-- | extra/stats/2020-02-19-prod-table-sizes.txt | 46 |
7 files changed, 250 insertions, 61 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 3a48a178..17865bc0 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -40,9 +40,11 @@ Drop and rebuild the schema: http delete :9200/fatcat_release http delete :9200/fatcat_container + http delete :9200/fatcat_file http delete :9200/fatcat_changelog http put :9200/fatcat_release < release_schema.json http put :9200/fatcat_container < container_schema.json + http put :9200/fatcat_file < file_schema.json http put :9200/fatcat_changelog < changelog_schema.json Put a single object (good for debugging): @@ -57,8 +59,9 @@ Bulk insert from a file on disk: Or, in a bulk production live-stream conversion: export LC_ALL=C.UTF-8 - time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release - time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_container -type container + time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release -type release + time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container -type container + time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file -type file ## Index Aliases diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index f3211e99..d8342549 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -16,20 +28,29 @@ "changelog": { "properties": { "index": { "type": "integer" }, - "editgroup_id": { "type": "keyword" }, + "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "timestamp": { "type": "date" }, - "editor_id": { "type": "keyword" }, - "username": { "type": "keyword" }, + "editor_id": { "type": "keyword", "normalizer": "default" }, + "username": { "type": "keyword", "normalizer": "caseSensitive" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, - "agent": { "type": "keyword" }, + "agent": { "type": "keyword", "normalizer": "caseSensitive" }, + "containers": { "type": "integer" }, + "new_containers": { "type": "integer" }, "creators": { "type": "integer" }, + "new_creators": { "type": "integer" }, "files": { "type": "integer" }, + "new_files": { "type": "integer" }, "filessets": { "type": "integer" }, + "new_filessets": { "type": "integer" }, "webcaptures": { "type": "integer" }, + "new_webcaptures": { "type": "integer" }, "releases": { "type": "integer" }, + "new_releases": { "type": "integer" }, "works": { "type": "integer" }, + "new_works": { "type": "integer" }, + "created": { "type": "integer" }, "updated": { "type": "integer" }, "deleted": { "type": "integer" }, diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index b0a47e85..5cd85b04 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -20,6 +20,18 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -27,43 +39,51 @@ "mappings": { "container": { "properties": { - "ident": { "type": "keyword" }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_type": { "type": "keyword" }, - "issnl": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "country": { "type": "keyword" }, - "region": { "type": "keyword" }, - "discipline": { "type": "keyword" }, - "languages": { "type": "keyword" }, - "mimetypes": { "type": "keyword" }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "issnl": { "type": "keyword", "normalizer": "default" }, + "issns": { "type": "keyword", "normalizer": "default" }, + "wikidata_qid": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "region": { "type": "keyword", "normalizer": "default" }, + "discipline": { "type": "keyword", "normalizer": "default" }, + "languages": { "type": "keyword", "normalizer": "default" }, + "mimetypes": { "type": "keyword", "normalizer": "default" }, "first_year": { "type": "integer" }, "last_year": { "type": "integer" }, - "in_doaj": { "type": "boolean" }, - "in_road": { "type": "boolean" }, - "in_doi": { "type": "boolean" }, - "in_sherpa_romeo":{ "type": "boolean" }, - "is_oa": { "type": "boolean" }, - "is_longtail_oa": { "type": "boolean" }, - "any_kbart": { "type": "boolean" }, - "any_jstor": { "type": "boolean" }, - "any_ia_sim": { "type": "boolean" }, + + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "in_doaj": { "type": "boolean" }, + "in_road": { "type": "boolean" }, + "is_oa": { "type": "boolean" }, + "is_longtail_oa": { "type": "boolean" }, + "any_kbart": { "type": "boolean" }, + "any_jstor": { "type": "boolean" }, + "any_ia_sim": { "type": "boolean" }, + "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" }, "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, "releases_ia": { "type": "integer" }, - "releases_sim": { "type": "integer" }, - "releases_shadow": { "type": "integer" }, + "releases_ia_sim": { "type": "integer" }, + "releases_shadows": { "type": "integer" }, "releases_any_file": { "type": "integer" }, "releases_any_fileset": { "type": "integer" }, "releases_any_webcapture": { "type": "integer" }, "year": { "type": "alias", "path": "first_year" }, "type": { "type": "alias", "path": "container_type" }, + "issn": { "type": "alias", "path": "issns" }, "oa": { "type": "alias", "path": "is_oa" }, "longtail": { "type": "alias", "path": "is_longtail_oa" } } diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json new file mode 100644 index 00000000..9c8ee64c --- /dev/null +++ b/extra/elasticsearch/file_schema.json @@ -0,0 +1,59 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } + } + } + } +}, +"mappings": { + "file": { + "properties": { + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + + "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "release_count": { "type": "integer" }, + "mimetype": { "type": "keyword", "normalizer": "default" }, + "size_bytes": { "type": "integer" }, + "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "md5": { "type": "keyword", "normalizer": "default", "doc_values": false }, + + "domains": { "type": "keyword", "normalizer": "default" }, + "hosts": { "type": "keyword", "normalizer": "default" }, + "rels": { "type": "keyword", "normalizer": "default" }, + "in_ia": { "type": "boolean" }, + "in_ia_petabox": { "type": "boolean" }, + + "release_id": { "type": "alias", "path": "release_ids" }, + "sha1hex": { "type": "alias", "path": "sha1" }, + "sha256hex": { "type": "alias", "path": "sha256" }, + "md5hex": { "type": "alias", "path": "md5" }, + "size": { "type": "alias", "path": "size_bytes" }, + "domain": { "type": "alias", "path": "domains" }, + "host": { "type": "alias", "path": "hosts" }, + "rel": { "type": "alias", "path": "rels" } + } + } +} +} diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 85026060..666a672f 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -20,6 +20,18 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -27,48 +39,66 @@ "mappings": { "release": { "properties": { - "ident": { "type": "keyword" }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "work_id": { "type": "keyword" }, - "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "work_id": { "type": "keyword", "normalizer": "default" }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "release_date": { "type": "date" }, - "release_year": { "type": "integer" }, - "release_type": { "type": "keyword" }, - "release_stage": { "type": "keyword" }, - "withdrawn_status": { "type": "keyword" }, - "language": { "type": "keyword" }, - "doi": { "type": "keyword" }, - "pmid": { "type": "keyword" }, - "pmcid": { "type": "keyword" }, - "isbn13": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "core_id": { "type": "keyword" }, - "axiv_id": { "type": "keyword" }, - "jstor_id": { "type": "keyword" }, - "ark_id": { "type": "keyword" }, - "mag_id": { "type": "keyword" }, - "license": { "type": "keyword" }, + "release_year": { "type": "integer", "copy_to": "biblio" }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "language": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "country_code_upper": { "type": "keyword", "normalizer": "caseSensitive" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "doi": { "type": "keyword", "normalizer": "default" }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license": { "type": "keyword", "normalizer": "default" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_id": { "type": "keyword" }, - "container_issnl": { "type": "keyword" }, - "container_type": { "type": "keyword" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "container_id": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "creator_ids": { "type": "keyword" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "affiliation_rors": { "type": "keyword", "normalizer": "default" }, + "creator_ids": { "type": "keyword", "normalizer": "default" }, "ref_count": { "type": "integer" }, "ref_linked_count": { "type": "integer" }, + "ref_release_ids": { "type": "keyword", "normalizer": "default" }, "file_count": { "type": "integer" }, "fileset_count": { "type": "integer" }, "webcapture_count": { "type": "integer" }, "any_abstract": { "type": "boolean" }, - "best_pdf_url": { "type": "keyword" }, - "ia_pdf_url": { "type": "keyword" }, + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, "is_oa": { "type": "boolean" }, + "oa_color": { "type": "keyword", "normalizer": "default" }, "is_longtail_oa": { "type": "boolean" }, "is_preserved": { "type": "boolean" }, "in_kbart": { "type": "boolean" }, @@ -79,7 +109,13 @@ "in_ia_sim": { "type": "boolean" }, "in_shadows": { "type": "boolean" }, "is_superceded": { "type": "boolean" }, + "is_retracted": { "type": "boolean" }, + "preservation": { "type": "keyword", "normalizer": "default" }, + "affiliation": { "type": "alias", "path": "affiliations" }, + "ror": { "type": "alias", "path": "affiliation_rors" }, + "creator_id": { "type": "alias", "path": "creator_ids" }, + "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, "author": { "type": "alias", "path": "contrib_names" }, "journal": { "type": "alias", "path": "container_name" }, "date": { "type": "alias", "path": "release_date" }, @@ -90,6 +126,9 @@ "lang": { "type": "alias", "path": "language" }, "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, "release_status": { "type": "alias", "path": "release_stage" }, + "stage": { "type": "alias", "path": "release_stage" }, + "type": { "type": "alias", "path": "release_type" }, + "retracted": { "type": "alias", "path": "is_retracted" }, "is_kept": { "type": "alias", "path": "in_kbart" } } } diff --git a/extra/stats/2020-02-19-prod-stats.json b/extra/stats/2020-02-19-prod-stats.json new file mode 100644 index 00000000..a2313233 --- /dev/null +++ b/extra/stats/2020-02-19-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":3509511,"timestamp":"2020-02-20T01:42:50.980212+00:00"}},"container":{"total":148356},"papers":{"in_kbart":60523853,"in_web":19616767,"in_web_not_kbart":8937938,"is_oa":11524180,"total":105665352},"release":{"refs_total":889522285,"total":143709455}} diff --git a/extra/stats/2020-02-19-prod-table-sizes.txt b/extra/stats/2020-02-19-prod-table-sizes.txt new file mode 100644 index 00000000..cab2b52e --- /dev/null +++ b/extra/stats/2020-02-19-prod-table-sizes.txt @@ -0,0 +1,46 @@ +Size: 476.74G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 53 GB | 43 GB | 96 GB + "public"."release_rev" | 58 GB | 33 GB | 91 GB + "public"."refs_blob" | 85 GB | 2884 MB | 88 GB + "public"."release_edit" | 14 GB | 20 GB | 34 GB + "public"."work_edit" | 13 GB | 20 GB | 34 GB + "public"."release_ident" | 9504 MB | 15 GB | 24 GB + "public"."work_ident" | 9302 MB | 15 GB | 24 GB + "public"."abstracts" | 16 GB | 1501 MB | 18 GB + "public"."file_rev_url" | 9980 MB | 3550 MB | 13 GB + "public"."work_rev" | 6038 MB | 5825 MB | 12 GB + "public"."release_ref" | 3997 MB | 5690 MB | 9686 MB + "public"."file_rev" | 3472 MB | 5103 MB | 8574 MB + "public"."file_edit" | 2934 MB | 3959 MB | 6893 MB + "public"."release_rev_abstract" | 2402 MB | 3339 MB | 5742 MB + "public"."file_ident" | 1795 MB | 2437 MB | 4231 MB + "public"."file_rev_release" | 1651 MB | 2428 MB | 4078 MB + "public"."creator_edit" | 702 MB | 942 MB | 1643 MB + "public"."creator_rev" | 695 MB | 719 MB | 1413 MB + "public"."editgroup" | 761 MB | 404 MB | 1164 MB + "public"."creator_ident" | 474 MB | 648 MB | 1121 MB + "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB + "public"."changelog" | 218 MB | 214 MB | 432 MB + "public"."container_rev" | 75 MB | 23 MB | 98 MB + "public"."container_edit" | 25 MB | 31 MB | 56 MB + "public"."container_ident" | 11 MB | 19 MB | 30 MB + "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."auth_oidc" | 16 kB | 48 kB | 64 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."editor" | 16 kB | 48 kB | 64 kB + "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB + "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) |