about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-04-30 11:06:18 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-04-30 11:06:18 -0700
commit daef08aec3f2297b972feef0cfc84ffd52e313e5 (patch)
tree 465a967120de25f3509c2f315cbbaad93b35706c
parent 948f3bce44a1abf31386672de307958def67d971 (diff)
download fatcat-cli-daef08aec3f2297b972feef0cfc84ffd52e313e5.tar.gz
download fatcat-cli-daef08aec3f2297b972feef0cfc84ffd52e313e5.zip
search: add scholar index; improve queries for file and container
-rw-r--r-- fatcat-cli/src/commands.rs 17
-rw-r--r-- fatcat-cli/src/lib.rs 2
-rw-r--r-- fatcat-cli/src/main.rs 3
-rw-r--r-- fatcat-cli/src/search.rs 142
4 files changed, 131 insertions, 33 deletions
diff --git a/fatcat-cli/src/commands.rs b/fatcat-cli/src/commands.rs
index eb7d5a8..6f52614 100644
--- a/fatcat-cli/src/commands.rs
+++ b/fatcat-cli/src/commands.rs
@@ -292,6 +292,9 @@ pub fn print_search_table(results: SearchResults, entity_type: SearchEntityType)
SearchEntityType::File => {
writeln!(tw, "ident\tsha1\tsize_bytes\tmimetype")?;
}
+ SearchEntityType::Scholar=> {
+ writeln!(tw, "key\ttype\tstage\tyear\tcontainer_name\ttitle")?;
+ }
}
for hit in results {
let hit = hit?;
@@ -331,6 +334,20 @@ pub fn print_search_table(results: SearchResults, entity_type: SearchEntityType)
hit["mimetype"].as_str().unwrap_or("-"),
)?;
}
+ SearchEntityType::Scholar => {
+ writeln!(
+ tw,
+ "{}\t{}\t{}\t{}\t{}\t{}",
+ hit["key"].as_str().unwrap_or("-"),
+ hit["biblio"]["release_type"].as_str().unwrap_or("-"),
+ hit["biblio"]["release_stage"].as_str().unwrap_or("-"),
+ hit["biblio"]["release_year"]
+ .as_u64()
+ .map_or("-".to_string(), |v| v.to_string()),
+ hit["biblio"]["container_name"].as_str().unwrap_or("-"),
+ hit["biblio"]["title"].as_str().unwrap_or("-"),
+ )?;
+ }
}
}
tw.flush()?;
diff --git a/fatcat-cli/src/lib.rs b/fatcat-cli/src/lib.rs
index d83ee76..6fd7b61 100644
--- a/fatcat-cli/src/lib.rs
+++ b/fatcat-cli/src/lib.rs
@@ -57,6 +57,7 @@ pub enum SearchEntityType {
Release,
Container,
File,
+ Scholar,
}
impl FromStr for SearchEntityType {
@@ -67,6 +68,7 @@ impl FromStr for SearchEntityType {
"release" | "releases" => Ok(SearchEntityType::Release),
"container" | "containers" => Ok(SearchEntityType::Container),
"file" | "files" => Ok(SearchEntityType::File),
+ "scholar" | "fulltext" => Ok(SearchEntityType::Scholar),
_ => Err(anyhow!("invalid entity type : {}", s)),
}
}
diff --git a/fatcat-cli/src/main.rs b/fatcat-cli/src/main.rs
index 15301b5..317a7c3 100644
--- a/fatcat-cli/src/main.rs
+++ b/fatcat-cli/src/main.rs
@@ -659,6 +659,9 @@ fn run(opt: Opt) -> Result<()> {
)?;
writeln!(&mut std::io::stdout(), "{}", entity.to_json_string()?)?
}
+ (false, true, SearchEntityType::Scholar) => {
+ return Err(anyhow!("entity schema output not supported for scholar index"));
+ }
}
}
}
diff --git a/fatcat-cli/src/search.rs b/fatcat-cli/src/search.rs
index e4f7dce..7d03f6f 100644
--- a/fatcat-cli/src/search.rs
+++ b/fatcat-cli/src/search.rs
@@ -79,6 +79,7 @@ pub fn crude_search(
SearchEntityType::Release => "fatcat_release",
SearchEntityType::Container => "fatcat_container",
SearchEntityType::File => "fatcat_file",
+ SearchEntityType::Scholar => "scholar_fulltext",
};
let http_client = reqwest::blocking::Client::builder()
.timeout(Duration::from_secs(10))
@@ -102,47 +103,122 @@ pub fn crude_search(
Some(l) => (false, "_score", l),
};
- let query_body = json!({
- "query": {
- "boosting": {
- "positive": {
- "bool": {
- "must": {
- "query_string": {
- "query": query,
- "default_operator": "AND",
- "analyze_wildcard": true,
- "allow_leading_wildcard": false,
- "lenient": true,
- "fields": [
- "title^2",
- "biblio",
- ],
+ let query_body = match entity_type {
+ SearchEntityType::Release => json!({
+ "query": {
+ "boosting": {
+ "positive": {
+ "bool": {
+ "must": {
+ "query_string": {
+ "query": query,
+ "default_operator": "AND",
+ "analyze_wildcard": true,
+ "allow_leading_wildcard": false,
+ "lenient": true,
+ "fields": [
+ "title^2",
+ "biblio",
+ ],
+ },
+ },
+ "should": {
+ "term": { "in_ia": true },
},
},
- "should": {
- "term": { "in_ia": true },
+ },
+ "negative": {
+ "bool": {
+ "should": [
+ {"bool": { "must_not" : { "exists": { "field": "title" }}}},
+ {"bool": { "must_not" : { "exists": { "field": "year" }}}},
+ {"bool": { "must_not" : { "exists": { "field": "type" }}}},
+ {"bool": { "must_not" : { "exists": { "field": "stage" }}}},
+ ],
},
},
+ "negative_boost": 0.5,
+ },
+ },
+ "size": size,
+ "sort": [ sort_mode ],
+ "track_total_hits": true,
+ }),
+ SearchEntityType::Container => json!({
+ "query": {
+ "query_string": {
+ "query": query,
+ "default_operator": "AND",
+ "analyze_wildcard": true,
+ "allow_leading_wildcard": false,
+ "lenient": true,
+ "fields": [
+ "name^2",
+ "biblio",
+ ],
},
- "negative": {
- "bool": {
- "should": [
- {"bool": { "must_not" : { "exists": { "field": "title" }}}},
- {"bool": { "must_not" : { "exists": { "field": "year" }}}},
- {"bool": { "must_not" : { "exists": { "field": "type" }}}},
- {"bool": { "must_not" : { "exists": { "field": "stage" }}}},
- ],
+ },
+ "size": size,
+ "sort": [ sort_mode ],
+ "track_total_hits": true,
+ }),
+ SearchEntityType::File => json!({
+ "query": {
+ "query_string": {
+ "query": query,
+ "default_operator": "AND",
+ "analyze_wildcard": true,
+ "allow_leading_wildcard": false,
+ "lenient": true,
+ },
+ },
+ "size": size,
+ "sort": [ sort_mode ],
+ "track_total_hits": true,
+ }),
+ SearchEntityType::Scholar => json!({
+ "query": {
+ "boosting": {
+ "positive": {
+ "bool": {
+ "must": {
+ "query_string": {
+ "query": query,
+ "default_operator": "AND",
+ "analyze_wildcard": true,
+ "allow_leading_wildcard": false,
+ "lenient": true,
+ "quote_field_suffix": ".exact",
+ "fields": [
+ "title^4",
+ "biblio_all^3",
+ "everything",
+ ],
+ },
+ },
+ "should": {
+ "terms": { "access_type": ["ia_sim", "ia_file", "wayback"]},
+ },
+ },
},
+ "negative": {
+ "bool": {
+ "should": [
+ {"bool": { "must_not" : { "exists": { "field": "year" }}}},
+ {"bool": { "must_not" : { "exists": { "field": "type" }}}},
+ {"bool": { "must_not" : { "exists": { "field": "stage" }}}},
+ {"bool": { "must_not" : { "exists": { "field": "biblio.container_name" }}}},
+ ],
+ },
+ },
+ "negative_boost": 0.5,
},
- "negative_boost": 0.5,
},
- },
- "size": size,
- "sort": [ sort_mode ],
- "track_total_hits": true,
- })
- .to_string();
+ "size": size,
+ "sort": [ sort_mode ],
+ "track_total_hits": true,
+ }),
+ }.to_string();
let mut request = http_client
.get(&request_url)