summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-02-10 14:09:47 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-02-10 14:09:47 -0800
commit 64b7dd2cd2e55008851ba5011032433e568d3544 (patch)
tree b479dec3e4f88a641563894fd5c6491aded31d3f
parent 29f2b072a6395edf8527de9a5ae76d53a045819c (diff)
download fatcat-cli-64b7dd2cd2e55008851ba5011032433e568d3544.tar.gz
download fatcat-cli-64b7dd2cd2e55008851ba5011032433e568d3544.zip
download output path/directory args
-rw-r--r-- fatcat-cli/src/download.rs 65
-rw-r--r-- fatcat-cli/src/main.rs 45
2 files changed, 75 insertions, 35 deletions
diff --git a/fatcat-cli/src/download.rs b/fatcat-cli/src/download.rs
index 7821b70..6a420b0 100644
--- a/fatcat-cli/src/download.rs
+++ b/fatcat-cli/src/download.rs
@@ -6,7 +6,6 @@ use reqwest::header::USER_AGENT;
use std::fmt;
use std::fs::File;
use std::io::{self, BufRead};
-use std::path::Path;
use std::path::PathBuf;
use url::Url;
@@ -65,9 +64,27 @@ fn rewrite_wayback_url(url: Url) -> Result<Url> {
Ok(url)
}
+fn default_filename(fe: &FileEntity) -> Result<PathBuf> {
+ // Default output name is "<sha1 hex><suffix>"; the suffix is chosen from the entity's mimetype.
+ let sha1hex = &fe.sha1.clone().unwrap(); // panics if sha1 is None, so callers must check it first
+ let file_suffix = match fe.mimetype.as_ref().map(String::as_str) {
+ Some("application/pdf") => ".pdf",
+ Some("application/postscript") => ".ps",
+ Some("text/html") => ".html",
+ Some("text/xml") => ".xml",
+ // NOTE: most commonly .pdf if no type specified. should remove this after updating
+ // remaining file entities
+ None => ".pdf",
+ _ => "", // any other mimetype: no suffix is appended
+ };
+ // Always returns Ok: nothing in this function produces an Err.
+ let path_string = format!("{}{}", sha1hex, file_suffix);
+ Ok(PathBuf::from(&path_string))
+}
+
/// Attempts to download a file entity, including verifying checksum.
-pub fn download_file(fe: &FileEntity) -> Result<DownloadStatus> {
- let sha1hex = match &fe.sha1 {
+pub fn download_file(fe: &FileEntity, output_path: Option<PathBuf>) -> Result<DownloadStatus> {
+ match &fe.sha1 {
Some(v) => v,
None => return Ok(DownloadStatus::FileMissingMetadata),
};
@@ -76,18 +93,16 @@ pub fn download_file(fe: &FileEntity) -> Result<DownloadStatus> {
None => return Ok(DownloadStatus::FileMissingMetadata),
};
- let file_suffix = match fe.mimetype.as_ref().map(String::as_str) {
- Some("application/pdf") => ".pdf",
- Some("application/postscript") => ".pdf",
- Some("text/html") => ".html",
- Some("text/xml") => ".xml",
- _ => ".unknown",
+ let final_path = match output_path {
+ Some(ref path) if path.is_dir() => {
+ let mut full = output_path.unwrap_or(PathBuf::new());
+ full.push(default_filename(fe)?);
+ full
+ }
+ Some(path) => path,
+ None => default_filename(fe)?,
};
- // TODO: output directory
- let path_string = format!("{}{}", sha1hex, file_suffix);
- let final_path = Path::new(&path_string);
-
// NOTE: this isn't perfect; there could have been a race condition
if final_path.exists() {
return Ok(DownloadStatus::Exists(
@@ -95,8 +110,7 @@ pub fn download_file(fe: &FileEntity) -> Result<DownloadStatus> {
));
};
- let path_string = format!("{}{}.partial", sha1hex, file_suffix);
- let download_path = Path::new(&path_string);
+ let download_path = final_path.with_extension("download");
// TODO: only archive.org URLs (?)
let raw_url = match fe.urls.as_ref() {
@@ -115,7 +129,7 @@ pub fn download_file(fe: &FileEntity) -> Result<DownloadStatus> {
let download_file = match std::fs::OpenOptions::new()
.write(true)
.create_new(true)
- .open(download_path)
+ .open(&download_path)
{
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
return Ok(DownloadStatus::PartialExists(
@@ -163,13 +177,13 @@ pub fn download_file(fe: &FileEntity) -> Result<DownloadStatus> {
return Ok(DownloadStatus::WrongSize);
}
- std::fs::rename(download_path, final_path)?;
+ std::fs::rename(download_path, &final_path)?;
Ok(DownloadStatus::Downloaded(
final_path.to_string_lossy().to_string(),
))
}
-pub fn download_release(re: &ReleaseEntity) -> Result<DownloadStatus> {
+pub fn download_release(re: &ReleaseEntity, output_path: Option<PathBuf>) -> Result<DownloadStatus> {
let file_entities = match &re.files {
None => {
return Err(anyhow!(
@@ -180,7 +194,7 @@ pub fn download_release(re: &ReleaseEntity) -> Result<DownloadStatus> {
};
let mut status = DownloadStatus::NoPublicFile;
for fe in file_entities {
- status = download_file(&fe)?;
+ status = download_file(&fe, output_path.clone())?;
match status {
DownloadStatus::Exists(_) | DownloadStatus::Downloaded(_) => break,
_ => (),
@@ -190,11 +204,11 @@ pub fn download_release(re: &ReleaseEntity) -> Result<DownloadStatus> {
}
/// Tries either file or release
-fn download_entity(json_str: String) -> Result<DownloadStatus> {
+fn download_entity(json_str: String, output_path: Option<PathBuf>) -> Result<DownloadStatus> {
let release_attempt = serde_json::from_str::<ReleaseEntity>(&json_str);
if let Ok(re) = release_attempt {
if re.ident.is_some() && (re.title.is_some() || re.files.is_some()) {
- let status = download_release(&re)?;
+ let status = download_release(&re, output_path)?;
println!(
"release_{}\t{}\t{}",
re.ident.unwrap(),
@@ -209,7 +223,7 @@ fn download_entity(json_str: String) -> Result<DownloadStatus> {
match file_attempt {
Ok(fe) => {
if fe.ident.is_some() && fe.urls.is_some() {
- let status = download_file(&fe)?;
+ let status = download_file(&fe, output_path)?;
println!(
"file_{}\t{}\t{}",
fe.ident.unwrap(),
@@ -225,7 +239,8 @@ fn download_entity(json_str: String) -> Result<DownloadStatus> {
}
}
-pub fn download_batch(input_path: Option<PathBuf>, limit: Option<u64>) -> Result<u64> {
+pub fn download_batch(input_path: Option<PathBuf>, output_dir: Option<PathBuf>, limit: Option<u64>, _jobs: u64) -> Result<u64> {
+ // TODO: create worker pipeline using channels
let count = 0;
match input_path {
None => {
@@ -234,7 +249,7 @@ pub fn download_batch(input_path: Option<PathBuf>, limit: Option<u64>) -> Result
let lines = stdin_lock.lines();
for line in lines {
let json_str = line?;
- download_entity(json_str)?;
+ download_entity(json_str, output_dir.clone())?;
if let Some(limit) = limit {
if count >= limit {
break;
@@ -248,7 +263,7 @@ pub fn download_batch(input_path: Option<PathBuf>, limit: Option<u64>) -> Result
let lines = buffered.lines();
for line in lines {
let json_str = line?;
- download_entity(json_str)?;
+ download_entity(json_str, output_dir.clone())?;
if let Some(limit) = limit {
if count >= limit {
break;
diff --git a/fatcat-cli/src/main.rs b/fatcat-cli/src/main.rs
index 055ac41..760f851 100644
--- a/fatcat-cli/src/main.rs
+++ b/fatcat-cli/src/main.rs
@@ -20,7 +20,7 @@ struct Opt {
)]
api_host: String,
- /// API auth tokens can be generated from the account page in the fatcat.wiki web interface
+ // API auth tokens can be generated from the account page in the fatcat.wiki web interface
#[structopt(
global = true,
long = "--api-token",
@@ -127,7 +127,13 @@ enum BatchCommand {
#[structopt(long)]
auto_accept: bool,
},
- Download {},
+ Download {
+ #[structopt(long, short = "-o", parse(from_os_str))]
+ output_dir: Option<PathBuf>,
+
+ #[structopt(long, short = "-j", default_value = "1")]
+ jobs: u64,
+ },
}
#[derive(StructOpt)]
@@ -151,7 +157,7 @@ enum Command {
entity_type: EntityType,
/// Input file, "-" for stdin.
- #[structopt(long = "--file", short = "-f", parse(from_os_str))]
+ #[structopt(long = "--input-file", short = "-i", parse(from_os_str))]
input_path: Option<PathBuf>,
#[structopt(
@@ -166,7 +172,7 @@ enum Command {
specifier: Specifier,
/// Input file, "-" for stdin.
- #[structopt(long = "--file", short = "-f", parse(from_os_str))]
+ #[structopt(long = "--input-file", short = "-i", parse(from_os_str))]
input_path: Option<PathBuf>,
#[structopt(
@@ -213,6 +219,9 @@ enum Command {
},
Download {
specifier: Specifier,
+
+ #[structopt(long = "--output-dir", short = "-o", parse(from_os_str))]
+ output_path: Option<PathBuf>,
},
History {
specifier: Specifier,
@@ -266,7 +275,7 @@ enum Command {
cmd: BatchCommand,
/// Input file, "-" for stdin.
- #[structopt(long = "--file", short = "-f", parse(from_os_str))]
+ #[structopt(long = "--input-file", short = "-i", parse(from_os_str))]
input_path: Option<PathBuf>,
#[structopt(long)]
@@ -463,14 +472,25 @@ fn run(opt: Opt) -> Result<()> {
batch.run(&mut api_client, input_path, BatchOp::Delete, None)?;
}
Command::Batch {
- cmd: BatchCommand::Download {},
+ cmd: BatchCommand::Download {
+ jobs,
+ output_dir,
+ },
input_path,
limit,
} => {
let input_path = path_or_stdin(input_path);
- download_batch(input_path, limit)?;
+ if let Some(ref dir) = output_dir {
+ if !dir.is_dir() {
+ return Err(anyhow!("output directory doesn't exist"));
+ }
+ }
+ download_batch(input_path, output_dir, limit, jobs)?;
}
- Command::Download { specifier } => {
+ Command::Download {
+ specifier,
+ output_path,
+ } => {
// run lookups if necessary (inefficient)
let specifier = match specifier {
Specifier::ReleaseLookup(_, _) | Specifier::FileLookup(_, _) => {
@@ -478,6 +498,11 @@ fn run(opt: Opt) -> Result<()> {
}
_ => specifier,
};
+ if let Some(ref path) = output_path {
+ if path.exists() {
+ return Err(anyhow!("refusing to over-write output file"));
+ }
+ }
let status = match specifier {
Specifier::Release(ident) => {
let result = api_client.rt.block_on(api_client.api.get_release(
@@ -490,7 +515,7 @@ fn run(opt: Opt) -> Result<()> {
resp => Err(anyhow!("{:?}", resp))
.with_context(|| format!("API GET failed: {:?}", ident)),
}?;
- download_release(&release_entity)
+ download_release(&release_entity, output_path)
}
Specifier::File(ident) => {
let result = api_client.rt.block_on(api_client.api.get_file(
@@ -503,7 +528,7 @@ fn run(opt: Opt) -> Result<()> {
resp => Err(anyhow!("{:?}", resp))
.with_context(|| format!("API GET failed: {:?}", ident)),
}?;
- download_file(&file_entity)
+ download_file(&file_entity, output_path)
}
other => Err(anyhow!("Don't know how to download: {:?}", other)),
}?;