From daef08aec3f2297b972feef0cfc84ffd52e313e5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 30 Apr 2021 11:06:18 -0700 Subject: search: add scholar index; improve queries for file and container --- fatcat-cli/src/commands.rs | 17 ++++++ fatcat-cli/src/lib.rs | 2 + fatcat-cli/src/main.rs | 3 + fatcat-cli/src/search.rs | 142 ++++++++++++++++++++++++++++++++++----------- 4 files changed, 131 insertions(+), 33 deletions(-) diff --git a/fatcat-cli/src/commands.rs b/fatcat-cli/src/commands.rs index eb7d5a8..6f52614 100644 --- a/fatcat-cli/src/commands.rs +++ b/fatcat-cli/src/commands.rs @@ -292,6 +292,9 @@ pub fn print_search_table(results: SearchResults, entity_type: SearchEntityType) SearchEntityType::File => { writeln!(tw, "ident\tsha1\tsize_bytes\tmimetype")?; } + SearchEntityType::Scholar=> { + writeln!(tw, "key\ttype\tstage\tyear\tcontainer_name\ttitle")?; + } } for hit in results { let hit = hit?; @@ -331,6 +334,20 @@ pub fn print_search_table(results: SearchResults, entity_type: SearchEntityType) hit["mimetype"].as_str().unwrap_or("-"), )?; } + SearchEntityType::Scholar => { + writeln!( + tw, + "{}\t{}\t{}\t{}\t{}\t{}", + hit["key"].as_str().unwrap_or("-"), + hit["biblio"]["release_type"].as_str().unwrap_or("-"), + hit["biblio"]["release_stage"].as_str().unwrap_or("-"), + hit["biblio"]["release_year"] + .as_u64() + .map_or("-".to_string(), |v| v.to_string()), + hit["biblio"]["container_name"].as_str().unwrap_or("-"), + hit["biblio"]["title"].as_str().unwrap_or("-"), + )?; + } } } tw.flush()?; diff --git a/fatcat-cli/src/lib.rs b/fatcat-cli/src/lib.rs index d83ee76..6fd7b61 100644 --- a/fatcat-cli/src/lib.rs +++ b/fatcat-cli/src/lib.rs @@ -57,6 +57,7 @@ pub enum SearchEntityType { Release, Container, File, + Scholar, } impl FromStr for SearchEntityType { @@ -67,6 +68,7 @@ impl FromStr for SearchEntityType { "release" | "releases" => Ok(SearchEntityType::Release), "container" | "containers" => Ok(SearchEntityType::Container), "file" | "files" => Ok(SearchEntityType::File), + "scholar" | "fulltext" => Ok(SearchEntityType::Scholar), _ => Err(anyhow!("invalid entity type : {}", s)), } } diff --git a/fatcat-cli/src/main.rs b/fatcat-cli/src/main.rs index 15301b5..317a7c3 100644 --- a/fatcat-cli/src/main.rs +++ b/fatcat-cli/src/main.rs @@ -659,6 +659,9 @@ fn run(opt: Opt) -> Result<()> { )?; writeln!(&mut std::io::stdout(), "{}", entity.to_json_string()?)? } + (false, true, SearchEntityType::Scholar) => { + return Err(anyhow!("entity schema output not supported for scholar index")); + } } } } diff --git a/fatcat-cli/src/search.rs b/fatcat-cli/src/search.rs index e4f7dce..7d03f6f 100644 --- a/fatcat-cli/src/search.rs +++ b/fatcat-cli/src/search.rs @@ -79,6 +79,7 @@ pub fn crude_search( SearchEntityType::Release => "fatcat_release", SearchEntityType::Container => "fatcat_container", SearchEntityType::File => "fatcat_file", + SearchEntityType::Scholar => "scholar_fulltext", }; let http_client = reqwest::blocking::Client::builder() .timeout(Duration::from_secs(10)) @@ -102,47 +103,122 @@ pub fn crude_search( Some(l) => (false, "_score", l), }; - let query_body = json!({ - "query": { - "boosting": { - "positive": { - "bool": { - "must": { - "query_string": { - "query": query, - "default_operator": "AND", - "analyze_wildcard": true, - "allow_leading_wildcard": false, - "lenient": true, - "fields": [ - "title^2", - "biblio", - ], + let query_body = match entity_type { + SearchEntityType::Release => json!({ + "query": { + "boosting": { + "positive": { + "bool": { + "must": { + "query_string": { + "query": query, + "default_operator": "AND", + "analyze_wildcard": true, + "allow_leading_wildcard": false, + "lenient": true, + "fields": [ + "title^2", + "biblio", + ], + }, + }, + "should": { + "term": { "in_ia": true }, }, }, - "should": { - "term": { "in_ia": true }, + }, + "negative": { + "bool": { + "should": [ + {"bool": { "must_not" : { "exists": { "field": "title" }}}}, + {"bool": { "must_not" : { "exists": { "field": "year" }}}}, + {"bool": { "must_not" : { "exists": { "field": "type" }}}}, + {"bool": { "must_not" : { "exists": { "field": "stage" }}}}, + ], }, }, + "negative_boost": 0.5, + }, + }, + "size": size, + "sort": [ sort_mode ], + "track_total_hits": true, + }), + SearchEntityType::Container => json!({ + "query": { + "query_string": { + "query": query, + "default_operator": "AND", + "analyze_wildcard": true, + "allow_leading_wildcard": false, + "lenient": true, + "fields": [ + "name^2", + "biblio", + ], }, - "negative": { - "bool": { - "should": [ - {"bool": { "must_not" : { "exists": { "field": "title" }}}}, - {"bool": { "must_not" : { "exists": { "field": "year" }}}}, - {"bool": { "must_not" : { "exists": { "field": "type" }}}}, - {"bool": { "must_not" : { "exists": { "field": "stage" }}}}, - ], + }, + "size": size, + "sort": [ sort_mode ], + "track_total_hits": true, + }), + SearchEntityType::File => json!({ + "query": { + "query_string": { + "query": query, + "default_operator": "AND", + "analyze_wildcard": true, + "allow_leading_wildcard": false, + "lenient": true, + }, + }, + "size": size, + "sort": [ sort_mode ], + "track_total_hits": true, + }), + SearchEntityType::Scholar => json!({ + "query": { + "boosting": { + "positive": { + "bool": { + "must": { + "query_string": { + "query": query, + "default_operator": "AND", + "analyze_wildcard": true, + "allow_leading_wildcard": false, + "lenient": true, + "quote_field_suffix": ".exact", + "fields": [ + "title^4", + "biblio_all^3", + "everything", + ], + }, + }, + "should": { + "terms": { "access_type": ["ia_sim", "ia_file", "wayback"]}, + }, + }, }, + "negative": { + "bool": { + "should": [ + {"bool": { "must_not" : { "exists": { "field": "year" }}}}, + {"bool": { "must_not" : { "exists": { "field": "type" }}}}, + {"bool": { "must_not" : { "exists": { "field": "stage" }}}}, + {"bool": { "must_not" : { "exists": { "field": "biblio.container_name" }}}}, + ], + }, + }, + "negative_boost": 0.5, }, - "negative_boost": 0.5, }, - }, - "size": size, - "sort": [ sort_mode ], - "track_total_hits": true, - }) - .to_string(); + "size": size, + "sort": [ sort_mode ], + "track_total_hits": true, + }), + }.to_string(); let mut request = http_client .get(&request_url) -- cgit v1.2.3