From 64b7dd2cd2e55008851ba5011032433e568d3544 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 10 Feb 2021 14:09:47 -0800 Subject: download output path/directory args --- fatcat-cli/src/download.rs | 65 ++++++++++++++++++++++++++++------------------ fatcat-cli/src/main.rs | 45 +++++++++++++++++++++++++------- 2 files changed, 75 insertions(+), 35 deletions(-) (limited to 'fatcat-cli/src') diff --git a/fatcat-cli/src/download.rs b/fatcat-cli/src/download.rs index 7821b70..6a420b0 100644 --- a/fatcat-cli/src/download.rs +++ b/fatcat-cli/src/download.rs @@ -6,7 +6,6 @@ use reqwest::header::USER_AGENT; use std::fmt; use std::fs::File; use std::io::{self, BufRead}; -use std::path::Path; use std::path::PathBuf; use url::Url; @@ -65,9 +64,27 @@ fn rewrite_wayback_url(url: Url) -> Result { Ok(url) } +fn default_filename(fe: &FileEntity) -> Result { + + let sha1hex = &fe.sha1.clone().unwrap(); + let file_suffix = match fe.mimetype.as_ref().map(String::as_str) { + Some("application/pdf") => ".pdf", + Some("application/postscript") => ".ps", + Some("text/html") => ".html", + Some("text/xml") => ".xml", + // NOTE: most commonly .pdf if no type specified. should remove this after updating + // remaining file entities + None => ".pdf", + _ => "", + }; + + let path_string = format!("{}{}", sha1hex, file_suffix); + Ok(PathBuf::from(&path_string)) +} + /// Attempts to download a file entity, including verifying checksum. -pub fn download_file(fe: &FileEntity) -> Result { - let sha1hex = match &fe.sha1 { +pub fn download_file(fe: &FileEntity, output_path: Option) -> Result { + match &fe.sha1 { Some(v) => v, None => return Ok(DownloadStatus::FileMissingMetadata), }; @@ -76,18 +93,16 @@ pub fn download_file(fe: &FileEntity) -> Result { None => return Ok(DownloadStatus::FileMissingMetadata), }; - let file_suffix = match fe.mimetype.as_ref().map(String::as_str) { - Some("application/pdf") => ".pdf", - Some("application/postscript") => ".pdf", - Some("text/html") => ".html", - Some("text/xml") => ".xml", - _ => ".unknown", + let final_path = match output_path { + Some(ref path) if path.is_dir() => { + let mut full = output_path.unwrap_or(PathBuf::new()); + full.push(default_filename(fe)?); + full + } + Some(path) => path, + None => default_filename(fe)?, }; - // TODO: output directory - let path_string = format!("{}{}", sha1hex, file_suffix); - let final_path = Path::new(&path_string); - // NOTE: this isn't perfect; there could have been a race condition if final_path.exists() { return Ok(DownloadStatus::Exists( @@ -95,8 +110,7 @@ pub fn download_file(fe: &FileEntity) -> Result { )); }; - let path_string = format!("{}{}.partial", sha1hex, file_suffix); - let download_path = Path::new(&path_string); + let download_path = final_path.with_extension("download"); // TODO: only archive.org URLs (?) let raw_url = match fe.urls.as_ref() { @@ -115,7 +129,7 @@ pub fn download_file(fe: &FileEntity) -> Result { let download_file = match std::fs::OpenOptions::new() .write(true) .create_new(true) - .open(download_path) + .open(&download_path) { Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { return Ok(DownloadStatus::PartialExists( @@ -163,13 +177,13 @@ pub fn download_file(fe: &FileEntity) -> Result { return Ok(DownloadStatus::WrongSize); } - std::fs::rename(download_path, final_path)?; + std::fs::rename(download_path, &final_path)?; Ok(DownloadStatus::Downloaded( final_path.to_string_lossy().to_string(), )) } -pub fn download_release(re: &ReleaseEntity) -> Result { +pub fn download_release(re: &ReleaseEntity, output_path: Option) -> Result { let file_entities = match &re.files { None => { return Err(anyhow!( @@ -180,7 +194,7 @@ pub fn download_release(re: &ReleaseEntity) -> Result { }; let mut status = DownloadStatus::NoPublicFile; for fe in file_entities { - status = download_file(&fe)?; + status = download_file(&fe, output_path.clone())?; match status { DownloadStatus::Exists(_) | DownloadStatus::Downloaded(_) => break, _ => (), @@ -190,11 +204,11 @@ pub fn download_release(re: &ReleaseEntity) -> Result { } /// Tries either file or release -fn download_entity(json_str: String) -> Result { +fn download_entity(json_str: String, output_path: Option) -> Result { let release_attempt = serde_json::from_str::(&json_str); if let Ok(re) = release_attempt { if re.ident.is_some() && (re.title.is_some() || re.files.is_some()) { - let status = download_release(&re)?; + let status = download_release(&re, output_path)?; println!( "release_{}\t{}\t{}", re.ident.unwrap(), @@ -209,7 +223,7 @@ fn download_entity(json_str: String) -> Result { match file_attempt { Ok(fe) => { if fe.ident.is_some() && fe.urls.is_some() { - let status = download_file(&fe)?; + let status = download_file(&fe, output_path)?; println!( "file_{}\t{}\t{}", fe.ident.unwrap(), @@ -225,7 +239,8 @@ fn download_entity(json_str: String) -> Result { } } -pub fn download_batch(input_path: Option, limit: Option) -> Result { +pub fn download_batch(input_path: Option, output_dir: Option, limit: Option, _jobs: u64) -> Result { + // TODO: create worker pipeline using channels let count = 0; match input_path { None => { @@ -234,7 +249,7 @@ pub fn download_batch(input_path: Option, limit: Option) -> Result let lines = stdin_lock.lines(); for line in lines { let json_str = line?; - download_entity(json_str)?; + download_entity(json_str, output_dir.clone())?; if let Some(limit) = limit { if count >= limit { break; @@ -248,7 +263,7 @@ pub fn download_batch(input_path: Option, limit: Option) -> Result let lines = buffered.lines(); for line in lines { let json_str = line?; - download_entity(json_str)?; + download_entity(json_str, output_dir.clone())?; if let Some(limit) = limit { if count >= limit { break; diff --git a/fatcat-cli/src/main.rs b/fatcat-cli/src/main.rs index 055ac41..760f851 100644 --- a/fatcat-cli/src/main.rs +++ b/fatcat-cli/src/main.rs @@ -20,7 +20,7 @@ struct Opt { )] api_host: String, - /// API auth tokens can be generated from the account page in the fatcat.wiki web interface + // API auth tokens can be generated from the account page in the fatcat.wiki web interface #[structopt( global = true, long = "--api-token", @@ -127,7 +127,13 @@ enum BatchCommand { #[structopt(long)] auto_accept: bool, }, - Download {}, + Download { + #[structopt(long, short = "-o", parse(from_os_str))] + output_dir: Option, + + #[structopt(long, short = "-j", default_value = "1")] + jobs: u64, + }, } #[derive(StructOpt)] @@ -151,7 +157,7 @@ enum Command { entity_type: EntityType, /// Input file, "-" for stdin. - #[structopt(long = "--file", short = "-f", parse(from_os_str))] + #[structopt(long = "--input-file", short = "-i", parse(from_os_str))] input_path: Option, #[structopt( @@ -166,7 +172,7 @@ enum Command { specifier: Specifier, /// Input file, "-" for stdin. - #[structopt(long = "--file", short = "-f", parse(from_os_str))] + #[structopt(long = "--input-file", short = "-i", parse(from_os_str))] input_path: Option, #[structopt( @@ -213,6 +219,9 @@ enum Command { }, Download { specifier: Specifier, + + #[structopt(long = "--output-dir", short = "-o", parse(from_os_str))] + output_path: Option, }, History { specifier: Specifier, @@ -266,7 +275,7 @@ enum Command { cmd: BatchCommand, /// Input file, "-" for stdin. - #[structopt(long = "--file", short = "-f", parse(from_os_str))] + #[structopt(long = "--input-file", short = "-i", parse(from_os_str))] input_path: Option, #[structopt(long)] @@ -463,14 +472,25 @@ fn run(opt: Opt) -> Result<()> { batch.run(&mut api_client, input_path, BatchOp::Delete, None)?; } Command::Batch { - cmd: BatchCommand::Download {}, + cmd: BatchCommand::Download { + jobs, + output_dir, + }, input_path, limit, } => { let input_path = path_or_stdin(input_path); - download_batch(input_path, limit)?; + if let Some(ref dir) = output_dir { + if !dir.is_dir() { + return Err(anyhow!("output directory doesn't exist")); + } + } + download_batch(input_path, output_dir, limit, jobs)?; } - Command::Download { specifier } => { + Command::Download { + specifier, + output_path, + } => { // run lookups if necessary (inefficient) let specifier = match specifier { Specifier::ReleaseLookup(_, _) | Specifier::FileLookup(_, _) => { @@ -478,6 +498,11 @@ fn run(opt: Opt) -> Result<()> { } _ => specifier, }; + if let Some(ref path) = output_path { + if path.exists() { + return Err(anyhow!("refusing to over-write output file")); + } + } let status = match specifier { Specifier::Release(ident) => { let result = api_client.rt.block_on(api_client.api.get_release( @@ -490,7 +515,7 @@ fn run(opt: Opt) -> Result<()> { resp => Err(anyhow!("{:?}", resp)) .with_context(|| format!("API GET failed: {:?}", ident)), }?; - download_release(&release_entity) + download_release(&release_entity, output_path) } Specifier::File(ident) => { let result = api_client.rt.block_on(api_client.api.get_file( @@ -503,7 +528,7 @@ fn run(opt: Opt) -> Result<()> { resp => Err(anyhow!("{:?}", resp)) .with_context(|| format!("API GET failed: {:?}", ident)), }?; - download_file(&file_entity) + download_file(&file_entity, output_path) } other => Err(anyhow!("Don't know how to download: {:?}", other)), }?; -- cgit v1.2.3