aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-02-10 18:50:52 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-02-10 18:50:52 -0800
commit: 44fdd4333721ba400bd98d53bfe89897bebe8be5 (patch)
tree: a33118a1c1ac00bd57abc12f75cb83b221b00142
parent: 41e56409b89bcb7918221d088c685776884f3983 (diff)
download: fatcat-cli-44fdd4333721ba400bd98d53bfe89897bebe8be5.tar.gz
fatcat-cli-44fdd4333721ba400bd98d53bfe89897bebe8be5.zip
download: verify sha1 at download time
-rw-r--r-- Cargo.lock 7
-rw-r--r-- fatcat-cli/Cargo.toml 1
-rw-r--r-- fatcat-cli/src/download.rs 47
3 files changed, 49 insertions, 6 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 3211669..da300b7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -313,6 +313,7 @@ dependencies = [
"reqwest",
"serde 1.0.123",
"serde_json 1.0.55",
+ "sha1",
"structopt",
"swagger",
"tabwriter",
@@ -1545,6 +1546,12 @@ dependencies = [
]
[[package]]
+name = "sha1"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d"
+
+[[package]]
name = "slab"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/fatcat-cli/Cargo.toml b/fatcat-cli/Cargo.toml
index ac65298..1cb7aab 100644
--- a/fatcat-cli/Cargo.toml
+++ b/fatcat-cli/Cargo.toml
@@ -38,6 +38,7 @@ chrono-humanize = "*"
tempfile = "3"
indicatif = "0.15"
url = "*"
+sha1 = { version = "*", features = ["std"] }
[dev-dependencies]
assert_cmd = "1"
diff --git a/fatcat-cli/src/download.rs b/fatcat-cli/src/download.rs
index 2297652..5bec0d4 100644
--- a/fatcat-cli/src/download.rs
+++ b/fatcat-cli/src/download.rs
@@ -9,6 +9,7 @@ use std::io::{self, BufRead};
use std::path::PathBuf;
use url::Url;
use crate::{ApiModelIdent, Specifier};
+use sha1::Sha1;
#[derive(Debug, PartialEq, Clone)]
pub enum DownloadStatus {
@@ -51,6 +52,32 @@ impl fmt::Display for DownloadStatus {
}
}
+/// Writer adapter that pairs a destination writer with a running SHA-1 state,
+/// so a digest of the written bytes is available once writing is finished.
+struct Sha1WriteWrapper<W> {
+    /// Destination that receives the bytes.
+    writer: W,
+    /// SHA-1 state accumulated over the bytes passed through the wrapper.
+    hasher: Sha1,
+}
+
+impl<W> Sha1WriteWrapper<W> {
+    /// Wraps `writer`, starting from a fresh SHA-1 state.
+    fn new(writer: W) -> Self {
+        let hasher = Sha1::new();
+        Self { writer, hasher }
+    }
+
+    /// Consumes the wrapper (dropping the inner writer) and returns the
+    /// digest of everything hashed so far, as produced by `Sha1::hexdigest`.
+    fn into_hexdigest(self) -> String {
+        let Self { hasher, .. } = self;
+        hasher.hexdigest()
+    }
+}
+
+/// Forwards writes to the wrapped writer while feeding the same bytes to the
+/// SHA-1 hasher.
+///
+/// Only bytes the inner writer reports as accepted are hashed, so the digest
+/// always matches what was actually written, even across short writes or
+/// retried errors.
+impl<W: std::io::Write> std::io::Write for Sha1WriteWrapper<W> {
+    /// Writes `buf` to the inner writer and hashes the accepted prefix.
+    ///
+    /// Returns the number of bytes the inner writer accepted; any error from
+    /// the inner writer is propagated without touching the hash state.
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        // Write first: `Write::write` may accept fewer bytes than offered or
+        // fail with a retryable error (e.g. `Interrupted`). Callers such as
+        // `io::copy`/`write_all` then re-submit the remainder, so hashing the
+        // whole buffer up front would count those bytes twice.
+        let written = self.writer.write(buf)?;
+        // Clamp defensively in case a misbehaving writer over-reports.
+        self.hasher.update(&buf[..written.min(buf.len())]);
+        Ok(written)
+    }
+
+    /// Flushes the inner writer; the hash state is unaffected.
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.writer.flush()
+    }
+}
+
// eg, https://web.archive.org/web/20140802044207/http://www.geo.coop:80/sites/default/files/labs_of_oligarchy.pdf
fn rewrite_wayback_url(url: Url) -> Result<Url> {
// TODO: make this function correct, and add tests
@@ -71,8 +98,8 @@ fn default_filename(specifier: &Specifier, fe: &FileEntity) -> Result<PathBuf> {
Some("application/pdf") => ".pdf",
Some("application/postscript") => ".ps",
Some("text/html") => ".html",
- Some("text/xml") => ".xml",
- // NOTE: most commonly .pdf if no type specified. should remove this after updating
+ Some("text/xml") | Some("application/xml") | Some("application/xml+jats") => ".xml",
+ // NOTE: most commonly .pdf if no type specified. should remove this default after updating
// remaining file entities
None => ".pdf",
_ => "",
@@ -84,7 +111,7 @@ fn default_filename(specifier: &Specifier, fe: &FileEntity) -> Result<PathBuf> {
/// Attempts to download a file entity, including verifying checksum.
pub fn download_file(fe: &FileEntity, specifier: &Specifier, output_path: Option<PathBuf>) -> Result<DownloadStatus> {
- match &fe.sha1 {
+ let expected_sha1 = match &fe.sha1 {
Some(v) => v,
None => return Ok(DownloadStatus::FileMissingMetadata),
};
@@ -140,12 +167,12 @@ pub fn download_file(fe: &FileEntity, specifier: &Specifier, output_path: Option
Ok(f) => f,
};
- // TODO: print to stderr
+ // TODO: print to stderr (?)
info!("downloading: {}", url);
let client = reqwest::blocking::Client::new();
let mut resp = match client
.get(url)
- .header(USER_AGENT, "fatcat-cli/0.0.0")
+ .header(USER_AGENT, format!("fatcat-cli/{}", env!("CARGO_PKG_VERSION")))
.send()
{
Ok(r) => r,
@@ -167,11 +194,13 @@ pub fn download_file(fe: &FileEntity, specifier: &Specifier, output_path: Option
}
}
+
let pb = ProgressBar::new(fe.size.unwrap() as u64);
pb.set_style(ProgressStyle::default_bar()
.template("{spinner:.green} [{elapsed_precise}] [{bar:40}] {bytes}/{total_bytes} ({eta})")
.progress_chars("#>-"));
- let out_size = match resp.copy_to(&mut pb.wrap_write(download_file)) {
+ let mut wrapped_file = Sha1WriteWrapper::new(pb.wrap_write(download_file));
+ let out_size = match resp.copy_to(&mut wrapped_file) {
Ok(r) => r,
Err(e) => {
std::fs::remove_file(download_path)?;
@@ -179,6 +208,12 @@ pub fn download_file(fe: &FileEntity, specifier: &Specifier, output_path: Option
}
};
+ let out_sha1 = wrapped_file.into_hexdigest();
+ if &out_sha1 != expected_sha1 {
+ std::fs::remove_file(download_path)?;
+ return Ok(DownloadStatus::WrongHash);
+ }
+
if out_size != expected_size {
std::fs::remove_file(download_path)?;
return Ok(DownloadStatus::WrongSize);