From 44fdd4333721ba400bd98d53bfe89897bebe8be5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 10 Feb 2021 18:50:52 -0800 Subject: download: verify sha1 at download time --- Cargo.lock | 7 +++++++ fatcat-cli/Cargo.toml | 1 + fatcat-cli/src/download.rs | 47 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3211669..da300b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -313,6 +313,7 @@ dependencies = [ "reqwest", "serde 1.0.123", "serde_json 1.0.55", + "sha1", "structopt", "swagger", "tabwriter", @@ -1544,6 +1545,12 @@ dependencies = [ "url", ] +[[package]] +name = "sha1" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d" + [[package]] name = "slab" version = "0.4.2" diff --git a/fatcat-cli/Cargo.toml b/fatcat-cli/Cargo.toml index ac65298..1cb7aab 100644 --- a/fatcat-cli/Cargo.toml +++ b/fatcat-cli/Cargo.toml @@ -38,6 +38,7 @@ chrono-humanize = "*" tempfile = "3" indicatif = "0.15" url = "*" +sha1 = { version = "*", features = ["std"] } [dev-dependencies] assert_cmd = "1" diff --git a/fatcat-cli/src/download.rs b/fatcat-cli/src/download.rs index 2297652..5bec0d4 100644 --- a/fatcat-cli/src/download.rs +++ b/fatcat-cli/src/download.rs @@ -9,6 +9,7 @@ use std::io::{self, BufRead}; use std::path::PathBuf; use url::Url; use crate::{ApiModelIdent, Specifier}; +use sha1::Sha1; #[derive(Debug, PartialEq, Clone)] pub enum DownloadStatus { @@ -51,6 +52,32 @@ impl fmt::Display for DownloadStatus { } } +struct Sha1WriteWrapper { + writer: W, + hasher: Sha1, +} + +impl Sha1WriteWrapper { + fn new(writer: W) -> Self { + Sha1WriteWrapper { writer, hasher: Sha1::new() } + } + + fn into_hexdigest(self) -> String { + self.hasher.hexdigest() + } +} + +impl std::io::Write for Sha1WriteWrapper { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.hasher.update(buf); + self.writer.write(buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.writer.flush() + } +} + // eg, https://web.archive.org/web/20140802044207/http://www.geo.coop:80/sites/default/files/labs_of_oligarchy.pdf fn rewrite_wayback_url(url: Url) -> Result { // TODO: make this function correct, and add tests @@ -71,8 +98,8 @@ fn default_filename(specifier: &Specifier, fe: &FileEntity) -> Result { Some("application/pdf") => ".pdf", Some("application/postscript") => ".ps", Some("text/html") => ".html", - Some("text/xml") => ".xml", - // NOTE: most commonly .pdf if no type specified. should remove this after updating + Some("text/xml") | Some("application/xml") | Some("application/xml+jats") => ".xml", + // NOTE: most commonly .pdf if no type specified. should remove this default after updating // remaining file entities None => ".pdf", _ => "", @@ -84,7 +111,7 @@ fn default_filename(specifier: &Specifier, fe: &FileEntity) -> Result { /// Attempts to download a file entity, including verifying checksum. pub fn download_file(fe: &FileEntity, specifier: &Specifier, output_path: Option) -> Result { - match &fe.sha1 { + let expected_sha1 = match &fe.sha1 { Some(v) => v, None => return Ok(DownloadStatus::FileMissingMetadata), }; @@ -140,12 +167,12 @@ pub fn download_file(fe: &FileEntity, specifier: &Specifier, output_path: Option Ok(f) => f, }; - // TODO: print to stderr + // TODO: print to stderr (?) info!("downloading: {}", url); let client = reqwest::blocking::Client::new(); let mut resp = match client .get(url) - .header(USER_AGENT, "fatcat-cli/0.0.0") + .header(USER_AGENT, format!("fatcat-cli/{}", env!("CARGO_PKG_VERSION"))) .send() { Ok(r) => r, @@ -167,11 +194,13 @@ pub fn download_file(fe: &FileEntity, specifier: &Specifier, output_path: Option } } + let pb = ProgressBar::new(fe.size.unwrap() as u64); pb.set_style(ProgressStyle::default_bar() .template("{spinner:.green} [{elapsed_precise}] [{bar:40}] {bytes}/{total_bytes} ({eta})") .progress_chars("#>-")); - let out_size = match resp.copy_to(&mut pb.wrap_write(download_file)) { + let mut wrapped_file = Sha1WriteWrapper::new(pb.wrap_write(download_file)); + let out_size = match resp.copy_to(&mut wrapped_file) { Ok(r) => r, Err(e) => { std::fs::remove_file(download_path)?; @@ -179,6 +208,12 @@ pub fn download_file(fe: &FileEntity, specifier: &Specifier, output_path: Option } }; + let out_sha1 = wrapped_file.into_hexdigest(); + if &out_sha1 != expected_sha1 { + std::fs::remove_file(download_path)?; + return Ok(DownloadStatus::WrongHash); + } + if out_size != expected_size { std::fs::remove_file(download_path)?; return Ok(DownloadStatus::WrongSize); -- cgit v1.2.3