summaryrefslogtreecommitdiffstats
path: root/extra/elasticsearch
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-02-26 22:05:43 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-02-26 22:05:43 -0800
commitae50ee2274031ddc178fa4a10b59280e8440a24c (patch)
tree5a17d5d5f875ab6ff770c955b25626c6f36d16a8 /extra/elasticsearch
parent1556cdd7f0f5f4bc4fe5ccc9764c1598c852bb9b (diff)
parent81e0784813500a39955c20278140e25d7940d9c6 (diff)
downloadfatcat-ae50ee2274031ddc178fa4a10b59280e8440a24c.tar.gz
fatcat-ae50ee2274031ddc178fa4a10b59280e8440a24c.zip
Merge branch 'bnewbold-elastic-v03b'
Diffstat (limited to 'extra/elasticsearch')
-rw-r--r--extra/elasticsearch/README.md7
-rw-r--r--extra/elasticsearch/changelog_schema.json29
-rw-r--r--extra/elasticsearch/container_schema.json68
-rw-r--r--extra/elasticsearch/file_schema.json59
-rw-r--r--extra/elasticsearch/release_schema.json101
5 files changed, 203 insertions, 61 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index 3a48a178..df4cb918 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -40,9 +40,11 @@ Drop and rebuild the schema:
http delete :9200/fatcat_release
http delete :9200/fatcat_container
+ http delete :9200/fatcat_file
http delete :9200/fatcat_changelog
http put :9200/fatcat_release < release_schema.json
http put :9200/fatcat_container < container_schema.json
+ http put :9200/fatcat_file < file_schema.json
http put :9200/fatcat_changelog < changelog_schema.json
Put a single object (good for debugging):
@@ -57,8 +59,9 @@ Bulk insert from a file on disk:
Or, in a bulk production live-stream conversion:
export LC_ALL=C.UTF-8
- time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release
- time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_container -type container
+ time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_release -type release
+ time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_container -type container
+ time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_file -type file
## Index Aliases
diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json
index f3211e99..d8342549 100644
--- a/extra/elasticsearch/changelog_schema.json
+++ b/extra/elasticsearch/changelog_schema.json
@@ -8,6 +8,18 @@
"tokenizer": "standard",
"filter": [ "lowercase", "asciifolding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
@@ -16,20 +28,29 @@
"changelog": {
"properties": {
"index": { "type": "integer" },
- "editgroup_id": { "type": "keyword" },
+ "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
"timestamp": { "type": "date" },
- "editor_id": { "type": "keyword" },
- "username": { "type": "keyword" },
+ "editor_id": { "type": "keyword", "normalizer": "default" },
+ "username": { "type": "keyword", "normalizer": "caseSensitive" },
"is_bot": { "type": "boolean" },
"is_admin": { "type": "boolean" },
- "agent": { "type": "keyword" },
+ "agent": { "type": "keyword", "normalizer": "caseSensitive" },
+
"containers": { "type": "integer" },
+ "new_containers": { "type": "integer" },
"creators": { "type": "integer" },
+ "new_creators": { "type": "integer" },
"files": { "type": "integer" },
+ "new_files": { "type": "integer" },
"filessets": { "type": "integer" },
+ "new_filessets": { "type": "integer" },
"webcaptures": { "type": "integer" },
+ "new_webcaptures": { "type": "integer" },
"releases": { "type": "integer" },
+ "new_releases": { "type": "integer" },
"works": { "type": "integer" },
+ "new_works": { "type": "integer" },
+
"created": { "type": "integer" },
"updated": { "type": "integer" },
"deleted": { "type": "integer" },
diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json
index b0a47e85..5cd85b04 100644
--- a/extra/elasticsearch/container_schema.json
+++ b/extra/elasticsearch/container_schema.json
@@ -20,6 +20,18 @@
"char_filter": [ "icu_normalizer" ],
"filter": [ "icu_folding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
@@ -27,43 +39,51 @@
"mappings": {
"container": {
"properties": {
- "ident": { "type": "keyword" },
- "state": { "type": "keyword" },
- "revision": { "type": "keyword" },
- "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_type": { "type": "keyword" },
- "issnl": { "type": "keyword" },
- "wikidata_qid": { "type": "keyword" },
- "country": { "type": "keyword" },
- "region": { "type": "keyword" },
- "discipline": { "type": "keyword" },
- "languages": { "type": "keyword" },
- "mimetypes": { "type": "keyword" },
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "state": { "type": "keyword", "normalizer": "default" },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "publisher_type": { "type": "keyword", "normalizer": "default" },
+ "container_type": { "type": "keyword", "normalizer": "default" },
+ "issnl": { "type": "keyword", "normalizer": "default" },
+ "issns": { "type": "keyword", "normalizer": "default" },
+ "wikidata_qid": { "type": "keyword", "normalizer": "default" },
+ "country_code": { "type": "keyword", "normalizer": "default" },
+ "region": { "type": "keyword", "normalizer": "default" },
+ "discipline": { "type": "keyword", "normalizer": "default" },
+ "languages": { "type": "keyword", "normalizer": "default" },
+ "mimetypes": { "type": "keyword", "normalizer": "default" },
"first_year": { "type": "integer" },
"last_year": { "type": "integer" },
- "in_doaj": { "type": "boolean" },
- "in_road": { "type": "boolean" },
- "in_doi": { "type": "boolean" },
- "in_sherpa_romeo":{ "type": "boolean" },
- "is_oa": { "type": "boolean" },
- "is_longtail_oa": { "type": "boolean" },
- "any_kbart": { "type": "boolean" },
- "any_jstor": { "type": "boolean" },
- "any_ia_sim": { "type": "boolean" },
+
+ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+ "in_doaj": { "type": "boolean" },
+ "in_road": { "type": "boolean" },
+ "is_oa": { "type": "boolean" },
+ "is_longtail_oa": { "type": "boolean" },
+ "any_kbart": { "type": "boolean" },
+ "any_jstor": { "type": "boolean" },
+ "any_ia_sim": { "type": "boolean" },
+ "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" },
"releases_total": { "type": "integer" },
"releases_kbart": { "type": "integer" },
"releases_ia": { "type": "integer" },
- "releases_sim": { "type": "integer" },
- "releases_shadow": { "type": "integer" },
+ "releases_ia_sim": { "type": "integer" },
+ "releases_shadows": { "type": "integer" },
"releases_any_file": { "type": "integer" },
"releases_any_fileset": { "type": "integer" },
"releases_any_webcapture": { "type": "integer" },
"year": { "type": "alias", "path": "first_year" },
"type": { "type": "alias", "path": "container_type" },
+ "issn": { "type": "alias", "path": "issns" },
"oa": { "type": "alias", "path": "is_oa" },
"longtail": { "type": "alias", "path": "is_longtail_oa" }
}
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
new file mode 100644
index 00000000..9c8ee64c
--- /dev/null
+++ b/extra/elasticsearch/file_schema.json
@@ -0,0 +1,59 @@
+{
+"settings": {
+ "index": {
+ "analysis": {
+ "analyzer": {
+ "default": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [ "lowercase", "asciifolding" ]
+ }
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
+ }
+ }
+ }
+},
+"mappings": {
+ "file": {
+ "properties": {
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "state": { "type": "keyword", "normalizer": "default" },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
+
+ "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "release_count": { "type": "integer" },
+ "mimetype": { "type": "keyword", "normalizer": "default" },
+ "size_bytes": { "type": "long" },
+ "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "md5": { "type": "keyword", "normalizer": "default", "doc_values": false },
+
+ "domains": { "type": "keyword", "normalizer": "default" },
+ "hosts": { "type": "keyword", "normalizer": "default" },
+ "rels": { "type": "keyword", "normalizer": "default" },
+ "in_ia": { "type": "boolean" },
+ "in_ia_petabox": { "type": "boolean" },
+
+ "release_id": { "type": "alias", "path": "release_ids" },
+ "sha1hex": { "type": "alias", "path": "sha1" },
+ "sha256hex": { "type": "alias", "path": "sha256" },
+ "md5hex": { "type": "alias", "path": "md5" },
+ "size": { "type": "alias", "path": "size_bytes" },
+ "domain": { "type": "alias", "path": "domains" },
+ "host": { "type": "alias", "path": "hosts" },
+ "rel": { "type": "alias", "path": "rels" }
+ }
+ }
+}
+}
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 85026060..666a672f 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -20,6 +20,18 @@
"char_filter": [ "icu_normalizer" ],
"filter": [ "icu_folding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
@@ -27,48 +39,66 @@
"mappings": {
"release": {
"properties": {
- "ident": { "type": "keyword" },
- "state": { "type": "keyword" },
- "revision": { "type": "keyword" },
- "work_id": { "type": "keyword" },
- "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "state": { "type": "keyword", "normalizer": "default" },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "work_id": { "type": "keyword", "normalizer": "default" },
+ "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"release_date": { "type": "date" },
- "release_year": { "type": "integer" },
- "release_type": { "type": "keyword" },
- "release_stage": { "type": "keyword" },
- "withdrawn_status": { "type": "keyword" },
- "language": { "type": "keyword" },
- "doi": { "type": "keyword" },
- "pmid": { "type": "keyword" },
- "pmcid": { "type": "keyword" },
- "isbn13": { "type": "keyword" },
- "wikidata_qid": { "type": "keyword" },
- "core_id": { "type": "keyword" },
- "axiv_id": { "type": "keyword" },
- "jstor_id": { "type": "keyword" },
- "ark_id": { "type": "keyword" },
- "mag_id": { "type": "keyword" },
- "license": { "type": "keyword" },
+ "release_year": { "type": "integer", "copy_to": "biblio" },
+ "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "release_stage": { "type": "keyword", "normalizer": "default" },
+ "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "language": { "type": "keyword", "normalizer": "default" },
+ "country_code": { "type": "keyword", "normalizer": "default" },
+ "country_code_upper": { "type": "keyword", "normalizer": "caseSensitive" },
+ "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "first_page": { "type": "keyword", "normalizer": "default" },
+ "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "doi": { "type": "keyword", "normalizer": "default" },
+ "doi_prefix": { "type": "keyword", "normalizer": "default" },
+ "doi_registrar": { "type": "keyword", "normalizer": "default" },
+ "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "license": { "type": "keyword", "normalizer": "default" },
"publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_id": { "type": "keyword" },
- "container_issnl": { "type": "keyword" },
- "container_type": { "type": "keyword" },
+ "publisher_type": { "type": "keyword", "normalizer": "default" },
+ "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "container_id": { "type": "keyword", "normalizer": "default" },
+ "container_issnl": { "type": "keyword", "normalizer": "default" },
+ "container_type": { "type": "keyword", "normalizer": "default" },
"contrib_count": { "type": "integer" },
- "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "creator_ids": { "type": "keyword" },
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "affiliation_rors": { "type": "keyword", "normalizer": "default" },
+ "creator_ids": { "type": "keyword", "normalizer": "default" },
"ref_count": { "type": "integer" },
"ref_linked_count": { "type": "integer" },
+ "ref_release_ids": { "type": "keyword", "normalizer": "default" },
"file_count": { "type": "integer" },
"fileset_count": { "type": "integer" },
"webcapture_count": { "type": "integer" },
"any_abstract": { "type": "boolean" },
- "best_pdf_url": { "type": "keyword" },
- "ia_pdf_url": { "type": "keyword" },
+ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+ "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },
+ "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },
+ "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },
"is_oa": { "type": "boolean" },
+ "oa_color": { "type": "keyword", "normalizer": "default" },
"is_longtail_oa": { "type": "boolean" },
"is_preserved": { "type": "boolean" },
"in_kbart": { "type": "boolean" },
@@ -79,7 +109,13 @@
"in_ia_sim": { "type": "boolean" },
"in_shadows": { "type": "boolean" },
"is_superceded": { "type": "boolean" },
+ "is_retracted": { "type": "boolean" },
+ "preservation": { "type": "keyword", "normalizer": "default" },
+ "affiliation": { "type": "alias", "path": "affiliations" },
+ "ror": { "type": "alias", "path": "affiliation_rors" },
+ "creator_id": { "type": "alias", "path": "creator_ids" },
+ "ref_release_id": { "type": "alias", "path": "ref_release_ids" },
"author": { "type": "alias", "path": "contrib_names" },
"journal": { "type": "alias", "path": "container_name" },
"date": { "type": "alias", "path": "release_date" },
@@ -90,6 +126,9 @@
"lang": { "type": "alias", "path": "language" },
"file_pdf_url": { "type": "alias", "path": "best_pdf_url" },
"release_status": { "type": "alias", "path": "release_stage" },
+ "stage": { "type": "alias", "path": "release_stage" },
+ "type": { "type": "alias", "path": "release_type" },
+ "retracted": { "type": "alias", "path": "is_retracted" },
"is_kept": { "type": "alias", "path": "in_kbart" }
}
}