From 347facfebf03f6ce404c372179f6d974bbcff19e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 25 May 2021 17:11:57 -0700 Subject: download: fix wayback URL rewriting, and add a minimal test --- fatcat-cli/src/download.rs | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'fatcat-cli') diff --git a/fatcat-cli/src/download.rs b/fatcat-cli/src/download.rs index 4cb3143..44a11b7 100644 --- a/fatcat-cli/src/download.rs +++ b/fatcat-cli/src/download.rs @@ -84,16 +84,21 @@ impl std::io::Write for Sha1WriteWrapper { } // eg, https://web.archive.org/web/20140802044207/http://www.geo.coop:80/sites/default/files/labs_of_oligarchy.pdf -fn rewrite_wayback_url(url: Url) -> Result<Url> { - // TODO: make this function correct, and add tests +fn rewrite_wayback_url(mut url: Url) -> Result<Url> { let mut segments: Vec<String> = url .path_segments() .unwrap() .map(|x| x.to_string()) .collect(); - if segments[0] == "web" && segments[1].len() == 14 { + if segments.len() < 3 || url.host_str() != Some("web.archive.org") { + return Err(anyhow!("not a valid wayback URL: {:?}", url)); + } + // NOTE: the "12" digit timestamp here is to hack around a fraction of bad metadata in fatcat + // catalog circa 2019. should be cleaned up eventually then this can be deprecated. 
+ if segments[0] == "web" && (segments[1].len() == 14 || segments[1].len() == 12) { segments[1] = format!("{}id_", segments[1]); } + url.set_path(&segments.join("/")); Ok(url) } @@ -440,3 +445,22 @@ pub fn download_batch( done_receiver.recv()?; Ok(count) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_rewrite_wayback_url() -> () { + assert_eq!(rewrite_wayback_url(Url::parse("https://web.archive.org/web/20140802044207/http://www.geo.coop:80/sites/default/files/labs_of_oligarchy.pdf").unwrap()).unwrap(), + Url::parse("https://web.archive.org/web/20140802044207id_/http://www.geo.coop:80/sites/default/files/labs_of_oligarchy.pdf").unwrap()); + + // not a wayback URL + assert!(rewrite_wayback_url(Url::parse("https://fatcat.wiki/blah.pdf").unwrap()).is_err()); + + // too short + assert!( + rewrite_wayback_url(Url::parse("https://web.archive.org/file.pdf").unwrap()).is_err() + ); + } +} -- cgit v1.2.3