diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-03-27 00:04:44 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-03-27 00:04:44 +0100 |
commit | 460e2b517b663c9980693a49f66f2c1939bfbc4a (patch) | |
tree | 611c1cf53379336e5531dedac870fc07a2335272 | |
parent | e82e7a4bc83608efaa421e2e2a4f3dd302cefb81 (diff) | |
download | refcat-460e2b517b663c9980693a49f66f2c1939bfbc4a.tar.gz refcat-460e2b517b663c9980693a49f66f2c1939bfbc4a.zip |
wip: es test run
-rw-r--r-- | extra/elasticsearch/README.md | 3 | ||||
-rw-r--r-- | extra/elasticsearch/auto_mapping.json | 95 | ||||
-rw-r--r-- | extra/elasticsearch/fatcat_ref.json | 140 | ||||
-rw-r--r-- | extra/elasticsearch/sample20.json | 20 |
4 files changed, 258 insertions, 0 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md new file mode 100644 index 0000000..e9c20a2 --- /dev/null +++ b/extra/elasticsearch/README.md @@ -0,0 +1,3 @@ +# ES Schema Notes + +* schema will live in [https://git.archive.org/webgroup/fatcat/-/tree/master/extra/elasticsearch](https://git.archive.org/webgroup/fatcat/-/tree/master/extra/elasticsearch) diff --git a/extra/elasticsearch/auto_mapping.json b/extra/elasticsearch/auto_mapping.json new file mode 100644 index 0000000..72b43d7 --- /dev/null +++ b/extra/elasticsearch/auto_mapping.json @@ -0,0 +1,95 @@ +{ + "ref": { + "mappings": { + "properties": { + "match_provenance": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "match_reason": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "match_status": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "ref_index": { + "type": "long" + }, + "ref_key": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "source_release_ident": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "source_work_ident": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "source_year": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "target_release_ident": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "target_work_ident": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "update_ts": { + "type": "long" + } + } + } + } +} diff --git a/extra/elasticsearch/fatcat_ref.json b/extra/elasticsearch/fatcat_ref.json new file mode 100644 index 0000000..f4505ec --- /dev/null +++ b/extra/elasticsearch/fatcat_ref.json @@ -0,0 +1,140 @@ +{ + "ref": { + "settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "asciifolding" + ] + }, + "textIcu": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ + "icu_normalizer" + ], + "filter": [ + "icu_folding" + ] + }, + "textIcuSearch": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ + "icu_normalizer" + ], + "filter": [ + "icu_folding" + ] + } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": [ + "lowercase" + ] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } + } + } + } + }, + "mappings": { + "properties": { + "update_ts": { + "type": "date" + }, + "source_release_ident": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "source_work_ident": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "source_wikipedia_article": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "source_release_stage": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "source_release_year": { + "type": "integer" + }, + "ref_index": { + "type": "integer" + }, + "ref_key": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "ref_locator": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "target_release_ident": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "target_work_ident": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "target_openlibrary_work": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "target_url_surt": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "match_provenance": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "match_status": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "match_reason": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "target_unstructured": { + "type": "keyword", + "normalizer": "default", + "doc_values": false + }, + "target_csl": { + "type": "flattened" + } + } + } + } +} diff --git a/extra/elasticsearch/sample20.json b/extra/elasticsearch/sample20.json new file mode 100644 index 0000000..afe1508 --- /dev/null +++ b/extra/elasticsearch/sample20.json @@ -0,0 +1,20 @@ +{"_id":"djulycilxfegzmmf3oud2ctt3e_34","update_ts":1616550415,"source_release_ident":"djulycilxfegzmmf3oud2ctt3e","source_work_ident":"5r6wyyk2szfbhatxdhijqrjife","source_year":"2018","ref_index":34,"ref_key":"CIT0034","target_release_ident":"tcvffh4gfre5nadizpf4f2pcgm","target_work_ident":"hnl4df4vtfcrjbwdv652bhz7ky","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"qfln4z2rjbh25bjkdtgqfgbr5y_7","update_ts":1616550415,"source_release_ident":"qfln4z2rjbh25bjkdtgqfgbr5y","source_work_ident":"6tvnvlstyjbwlncoyo3y7eobfu","source_year":"2015","ref_index":7,"ref_key":"b6","target_release_ident":"fr2xflnbmfff3gx2vrxywllmp4","target_work_ident":"uyzsagyjd5gytoltooagyqy66u","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"cql2z4z2mzdtne26lbdclaf2mi_18","update_ts":1616550415,"source_release_ident":"cql2z4z2mzdtne26lbdclaf2mi","source_work_ident":"e3r7czvqhzbjzny2h2y5r4j76y","source_year":"2009","ref_index":18,"ref_key":"10.1111/j.1538-7836.2009.03685.x-BIB18|cit18","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"dy5zvfjrkfbebpl27tuzz5dkke_53","update_ts":1616550415,"source_release_ident":"dy5zvfjrkfbebpl27tuzz5dkke","source_work_ident":"k6hyfajpxndbzby6xr54ljbeeq","source_year":"2020","ref_index":53,"ref_key":"ref53","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"ert6zvqfc5aa3nsidktm3j55xu_10","update_ts":1616550415,"source_release_ident":"ert6zvqfc5aa3nsidktm3j55xu","source_work_ident":"gmje4uz7urdwbpg7h3ydpevkiu","source_year":"2016","ref_index":10,"ref_key":"11_21614868","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"h26pp6hosbcjjicho566uol5ge_31","update_ts":1616550415,"source_release_ident":"h26pp6hosbcjjicho566uol5ge","source_work_ident":"2yaek634q5goliam2rhjh2jr4m","source_year":"2008","ref_index":31,"ref_key":"31_21614868","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"hqyj3qgbk5gujfzue6jcaukr4y_38","update_ts":1616550415,"source_release_ident":"hqyj3qgbk5gujfzue6jcaukr4y","source_work_ident":"rmfk7t2lqbc6tckzvr3egsdwmu","source_year":"2018","ref_index":38,"ref_key":"2019013114251036000_133.5.425.38","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"iqnerhknrba6xexvjjpho65oee_33","update_ts":1616550415,"source_release_ident":"iqnerhknrba6xexvjjpho65oee","source_work_ident":"sxo2qxvnrrcxxdk4o7hehynpd4","source_year":"2013","ref_index":33,"ref_key":"b32","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"jakn3vnqkzgmlnahq2czeiuh6e_23","update_ts":1616550415,"source_release_ident":"jakn3vnqkzgmlnahq2czeiuh6e","source_work_ident":"l3xvnn6njjg7jazuacyaa3euey","source_year":"2011","ref_index":23,"ref_key":"10.1111/j.1741-6612.2011.00557.x-BIB23|cit23","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"mdh2udnpunae7nk7wpxsgcn5sa_86","update_ts":1616550415,"source_release_ident":"mdh2udnpunae7nk7wpxsgcn5sa","source_work_ident":"tkzanif3nbapvlvabuyhzhobxu","source_year":"2011","ref_index":86,"ref_key":"10.1002/9781118067178.ch13-BIB86|cit86","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"q3g4a7fcknhnfm36fmoopbegjq_38","update_ts":1616550415,"source_release_ident":"q3g4a7fcknhnfm36fmoopbegjq","source_work_ident":"i3vudv5nlbeyjplqat2me3h5iy","source_year":"2011","ref_index":38,"ref_key":"182_CR38","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"r63vltdsqjepbjas4fyjsw7eam_10","update_ts":1616550415,"source_release_ident":"r63vltdsqjepbjas4fyjsw7eam","source_work_ident":"hv7rfaq7tvfonpofl47an5inpq","source_year":"2019","ref_index":10,"ref_key":"2019030813540290000_3.5.789.10","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"tcbtkoboknfmbixg2cyqb22b54_50","update_ts":1616550415,"source_release_ident":"tcbtkoboknfmbixg2cyqb22b54","source_work_ident":"6x65q5qc5vebfaufqir3kjjjta","source_year":"2007","ref_index":50,"ref_key":"p_53","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"w2xbqexh5jhm5cxiiycs2jxzuu_3","update_ts":1616550415,"source_release_ident":"w2xbqexh5jhm5cxiiycs2jxzuu","source_work_ident":"rcyeidrbffd4zkgmj73dydigyu","source_year":"2006","ref_index":3,"ref_key":"b3_182","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"wiudhzpow5hsbit5iryfinzzoe_9","update_ts":1616550415,"source_release_ident":"wiudhzpow5hsbit5iryfinzzoe","source_work_ident":"ajfkocmq25g4ppimiljcjvc25e","source_year":"2008","ref_index":9,"ref_key":"211_CR9","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"x6he7os5djexfgy4jab4w6mi3u_175","update_ts":1616550415,"source_release_ident":"x6he7os5djexfgy4jab4w6mi3u","source_work_ident":"xj7zncec35hxnb5mu64rh5veci","source_year":"2018","ref_index":175,"ref_key":"2018112713541862000_2.22.3257.175","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"zruvqd3qgjg7tisdmfxodc4yre_75","update_ts":1616550415,"source_release_ident":"zruvqd3qgjg7tisdmfxodc4yre","source_work_ident":"g4hejcur2vantb5bcfcpaq5d5y","source_year":"2011","ref_index":75,"ref_key":"555_CR75","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"3vmppu3sojhx5bx6isg6jmqezm_6","update_ts":1616550415,"source_release_ident":"3vmppu3sojhx5bx6isg6jmqezm","source_work_ident":"dc7zbmqpc5dyjlk2h2v7ephexa","source_year":"2007","ref_index":6,"ref_key":"b6_6","target_release_ident":"spzazbi3yjay5njysf6cawap5a","target_work_ident":"f6zmxoup4fdmfcc2i4epryagqq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"rnm54oxocbagjea7k2o5nocv44_137","update_ts":1616550415,"source_release_ident":"rnm54oxocbagjea7k2o5nocv44","source_work_ident":"kpe3wn6uc5bmlcjcqsuk66ptrm","source_year":"2010","ref_index":137,"ref_key":"B137","target_release_ident":"spzazbi3yjay5njysf6cawap5a","target_work_ident":"f6zmxoup4fdmfcc2i4epryagqq","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"vpa2kwd4s5c45ckjzipkzfvzmq_13","update_ts":1616550415,"source_release_ident":"vpa2kwd4s5c45ckjzipkzfvzmq","source_work_ident":"hzgjcinf7vdtfa72cyvcxjzt2u","source_year":"2007","ref_index":13,"ref_key":"CIT0014","target_release_ident":"spzazbi3yjay5njysf6cawap5a","target_work_ident":"f6zmxoup4fdmfcc2i4epryagqq","match_provenance":"join","match_status":"exact","match_reason":"doi"} |