about summary refs log tree commit diff stats
path: root/extra/elasticsearch
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-03-27 00:04:44 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2021-03-27 00:04:44 +0100
commit 460e2b517b663c9980693a49f66f2c1939bfbc4a (patch)
tree 611c1cf53379336e5531dedac870fc07a2335272 /extra/elasticsearch
parent e82e7a4bc83608efaa421e2e2a4f3dd302cefb81 (diff)
download refcat-460e2b517b663c9980693a49f66f2c1939bfbc4a.tar.gz
refcat-460e2b517b663c9980693a49f66f2c1939bfbc4a.zip
wip: es test run
Diffstat (limited to 'extra/elasticsearch')
-rw-r--r-- extra/elasticsearch/README.md 3
-rw-r--r-- extra/elasticsearch/auto_mapping.json 95
-rw-r--r-- extra/elasticsearch/fatcat_ref.json 140
-rw-r--r-- extra/elasticsearch/sample20.json 20
4 files changed, 258 insertions, 0 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
new file mode 100644
index 0000000..e9c20a2
--- /dev/null
+++ b/extra/elasticsearch/README.md
@@ -0,0 +1,3 @@
+# ES Schema Notes
+
+* schema will live in [https://git.archive.org/webgroup/fatcat/-/tree/master/extra/elasticsearch](https://git.archive.org/webgroup/fatcat/-/tree/master/extra/elasticsearch)
diff --git a/extra/elasticsearch/auto_mapping.json b/extra/elasticsearch/auto_mapping.json
new file mode 100644
index 0000000..72b43d7
--- /dev/null
+++ b/extra/elasticsearch/auto_mapping.json
@@ -0,0 +1,95 @@
+{
+ "ref": {
+ "mappings": {
+ "properties": {
+ "match_provenance": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 256
+ }
+ }
+ },
+ "match_reason": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 256
+ }
+ }
+ },
+ "match_status": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 256
+ }
+ }
+ },
+ "ref_index": {
+ "type": "long"
+ },
+ "ref_key": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 256
+ }
+ }
+ },
+ "source_release_ident": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 256
+ }
+ }
+ },
+ "source_work_ident": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 256
+ }
+ }
+ },
+ "source_year": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 256
+ }
+ }
+ },
+ "target_release_ident": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 256
+ }
+ }
+ },
+ "target_work_ident": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 256
+ }
+ }
+ },
+ "update_ts": {
+ "type": "long"
+ }
+ }
+ }
+ }
+}
diff --git a/extra/elasticsearch/fatcat_ref.json b/extra/elasticsearch/fatcat_ref.json
new file mode 100644
index 0000000..f4505ec
--- /dev/null
+++ b/extra/elasticsearch/fatcat_ref.json
@@ -0,0 +1,140 @@
+{
+ "ref": {
+ "settings": {
+ "index": {
+ "analysis": {
+ "analyzer": {
+ "default": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase",
+ "asciifolding"
+ ]
+ },
+ "textIcu": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [
+ "icu_normalizer"
+ ],
+ "filter": [
+ "icu_folding"
+ ]
+ },
+ "textIcuSearch": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [
+ "icu_normalizer"
+ ],
+ "filter": [
+ "icu_folding"
+ ]
+ }
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": [
+ "lowercase"
+ ]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
+ }
+ }
+ }
+ },
+ "mappings": {
+ "properties": {
+ "update_ts": {
+ "type": "date"
+ },
+ "source_release_ident": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "source_work_ident": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "source_wikipedia_article": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "source_release_stage": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "source_release_year": {
+ "type": "integer"
+ },
+ "ref_index": {
+ "type": "integer"
+ },
+ "ref_key": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "ref_locator": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "target_release_ident": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "target_work_ident": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "target_openlibrary_work": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "target_url_surt": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "match_provenance": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "match_status": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "match_reason": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "target_unstructured": {
+ "type": "keyword",
+ "normalizer": "default",
+ "doc_values": false
+ },
+ "target_csl": {
+ "type": "flattened"
+ }
+ }
+ }
+ }
+}
diff --git a/extra/elasticsearch/sample20.json b/extra/elasticsearch/sample20.json
new file mode 100644
index 0000000..afe1508
--- /dev/null
+++ b/extra/elasticsearch/sample20.json
@@ -0,0 +1,20 @@
+{"_id":"djulycilxfegzmmf3oud2ctt3e_34","update_ts":1616550415,"source_release_ident":"djulycilxfegzmmf3oud2ctt3e","source_work_ident":"5r6wyyk2szfbhatxdhijqrjife","source_year":"2018","ref_index":34,"ref_key":"CIT0034","target_release_ident":"tcvffh4gfre5nadizpf4f2pcgm","target_work_ident":"hnl4df4vtfcrjbwdv652bhz7ky","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"qfln4z2rjbh25bjkdtgqfgbr5y_7","update_ts":1616550415,"source_release_ident":"qfln4z2rjbh25bjkdtgqfgbr5y","source_work_ident":"6tvnvlstyjbwlncoyo3y7eobfu","source_year":"2015","ref_index":7,"ref_key":"b6","target_release_ident":"fr2xflnbmfff3gx2vrxywllmp4","target_work_ident":"uyzsagyjd5gytoltooagyqy66u","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"cql2z4z2mzdtne26lbdclaf2mi_18","update_ts":1616550415,"source_release_ident":"cql2z4z2mzdtne26lbdclaf2mi","source_work_ident":"e3r7czvqhzbjzny2h2y5r4j76y","source_year":"2009","ref_index":18,"ref_key":"10.1111/j.1538-7836.2009.03685.x-BIB18|cit18","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"dy5zvfjrkfbebpl27tuzz5dkke_53","update_ts":1616550415,"source_release_ident":"dy5zvfjrkfbebpl27tuzz5dkke","source_work_ident":"k6hyfajpxndbzby6xr54ljbeeq","source_year":"2020","ref_index":53,"ref_key":"ref53","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"ert6zvqfc5aa3nsidktm3j55xu_10","update_ts":1616550415,"source_release_ident":"ert6zvqfc5aa3nsidktm3j55xu","source_work_ident":"gmje4uz7urdwbpg7h3ydpevkiu","source_year":"2016","ref_index":10,"ref_key":"11_21614868","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"h26pp6hosbcjjicho566uol5ge_31","update_ts":1616550415,"source_release_ident":"h26pp6hosbcjjicho566uol5ge","source_work_ident":"2yaek634q5goliam2rhjh2jr4m","source_year":"2008","ref_index":31,"ref_key":"31_21614868","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"hqyj3qgbk5gujfzue6jcaukr4y_38","update_ts":1616550415,"source_release_ident":"hqyj3qgbk5gujfzue6jcaukr4y","source_work_ident":"rmfk7t2lqbc6tckzvr3egsdwmu","source_year":"2018","ref_index":38,"ref_key":"2019013114251036000_133.5.425.38","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"iqnerhknrba6xexvjjpho65oee_33","update_ts":1616550415,"source_release_ident":"iqnerhknrba6xexvjjpho65oee","source_work_ident":"sxo2qxvnrrcxxdk4o7hehynpd4","source_year":"2013","ref_index":33,"ref_key":"b32","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"jakn3vnqkzgmlnahq2czeiuh6e_23","update_ts":1616550415,"source_release_ident":"jakn3vnqkzgmlnahq2czeiuh6e","source_work_ident":"l3xvnn6njjg7jazuacyaa3euey","source_year":"2011","ref_index":23,"ref_key":"10.1111/j.1741-6612.2011.00557.x-BIB23|cit23","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"mdh2udnpunae7nk7wpxsgcn5sa_86","update_ts":1616550415,"source_release_ident":"mdh2udnpunae7nk7wpxsgcn5sa","source_work_ident":"tkzanif3nbapvlvabuyhzhobxu","source_year":"2011","ref_index":86,"ref_key":"10.1002/9781118067178.ch13-BIB86|cit86","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"q3g4a7fcknhnfm36fmoopbegjq_38","update_ts":1616550415,"source_release_ident":"q3g4a7fcknhnfm36fmoopbegjq","source_work_ident":"i3vudv5nlbeyjplqat2me3h5iy","source_year":"2011","ref_index":38,"ref_key":"182_CR38","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"r63vltdsqjepbjas4fyjsw7eam_10","update_ts":1616550415,"source_release_ident":"r63vltdsqjepbjas4fyjsw7eam","source_work_ident":"hv7rfaq7tvfonpofl47an5inpq","source_year":"2019","ref_index":10,"ref_key":"2019030813540290000_3.5.789.10","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"tcbtkoboknfmbixg2cyqb22b54_50","update_ts":1616550415,"source_release_ident":"tcbtkoboknfmbixg2cyqb22b54","source_work_ident":"6x65q5qc5vebfaufqir3kjjjta","source_year":"2007","ref_index":50,"ref_key":"p_53","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"w2xbqexh5jhm5cxiiycs2jxzuu_3","update_ts":1616550415,"source_release_ident":"w2xbqexh5jhm5cxiiycs2jxzuu","source_work_ident":"rcyeidrbffd4zkgmj73dydigyu","source_year":"2006","ref_index":3,"ref_key":"b3_182","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"wiudhzpow5hsbit5iryfinzzoe_9","update_ts":1616550415,"source_release_ident":"wiudhzpow5hsbit5iryfinzzoe","source_work_ident":"ajfkocmq25g4ppimiljcjvc25e","source_year":"2008","ref_index":9,"ref_key":"211_CR9","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"x6he7os5djexfgy4jab4w6mi3u_175","update_ts":1616550415,"source_release_ident":"x6he7os5djexfgy4jab4w6mi3u","source_work_ident":"xj7zncec35hxnb5mu64rh5veci","source_year":"2018","ref_index":175,"ref_key":"2018112713541862000_2.22.3257.175","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"zruvqd3qgjg7tisdmfxodc4yre_75","update_ts":1616550415,"source_release_ident":"zruvqd3qgjg7tisdmfxodc4yre","source_work_ident":"g4hejcur2vantb5bcfcpaq5d5y","source_year":"2011","ref_index":75,"ref_key":"555_CR75","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"3vmppu3sojhx5bx6isg6jmqezm_6","update_ts":1616550415,"source_release_ident":"3vmppu3sojhx5bx6isg6jmqezm","source_work_ident":"dc7zbmqpc5dyjlk2h2v7ephexa","source_year":"2007","ref_index":6,"ref_key":"b6_6","target_release_ident":"spzazbi3yjay5njysf6cawap5a","target_work_ident":"f6zmxoup4fdmfcc2i4epryagqq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"rnm54oxocbagjea7k2o5nocv44_137","update_ts":1616550415,"source_release_ident":"rnm54oxocbagjea7k2o5nocv44","source_work_ident":"kpe3wn6uc5bmlcjcqsuk66ptrm","source_year":"2010","ref_index":137,"ref_key":"B137","target_release_ident":"spzazbi3yjay5njysf6cawap5a","target_work_ident":"f6zmxoup4fdmfcc2i4epryagqq","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"vpa2kwd4s5c45ckjzipkzfvzmq_13","update_ts":1616550415,"source_release_ident":"vpa2kwd4s5c45ckjzipkzfvzmq","source_work_ident":"hzgjcinf7vdtfa72cyvcxjzt2u","source_year":"2007","ref_index":13,"ref_key":"CIT0014","target_release_ident":"spzazbi3yjay5njysf6cawap5a","target_work_ident":"f6zmxoup4fdmfcc2i4epryagqq","match_provenance":"join","match_status":"exact","match_reason":"doi"}