summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-05-25 17:11:57 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-05-25 17:11:57 -0700
commit 347facfebf03f6ce404c372179f6d974bbcff19e (patch)
tree 241918711e326e325f2fb97891e40e8020463a4e
parent e4ef625255f2d4130c5387085f9d5555691f3afe (diff)
download fatcat-cli-347facfebf03f6ce404c372179f6d974bbcff19e.tar.gz
fatcat-cli-347facfebf03f6ce404c372179f6d974bbcff19e.zip
download: fix wayback URL rewriting, and add a minimal test
-rw-r--r-- fatcat-cli/src/download.rs 30
1 files changed, 27 insertions, 3 deletions
diff --git a/fatcat-cli/src/download.rs b/fatcat-cli/src/download.rs
index 4cb3143..44a11b7 100644
--- a/fatcat-cli/src/download.rs
+++ b/fatcat-cli/src/download.rs
@@ -84,16 +84,21 @@ impl<W: std::io::Write> std::io::Write for Sha1WriteWrapper<W> {
}
// eg, https://web.archive.org/web/20140802044207/http://www.geo.coop:80/sites/default/files/labs_of_oligarchy.pdf
-fn rewrite_wayback_url(url: Url) -> Result<Url> {
- // TODO: make this function correct, and add tests
+fn rewrite_wayback_url(mut url: Url) -> Result<Url> {
let mut segments: Vec<String> = url
.path_segments()
.unwrap()
.map(|x| x.to_string())
.collect();
- if segments[0] == "web" && segments[1].len() == 14 {
+ if segments.len() < 3 || url.host_str() != Some("web.archive.org") {
+ return Err(anyhow!("not a valid wayback URL: {:?}", url));
+ }
+ // NOTE: the "12" digit timestamp here is to hack around a fraction of bad metadata in fatcat
+ // catalog circa 2019. should be cleaned up eventually then this can be deprecated.
+ if segments[0] == "web" && (segments[1].len() == 14 || segments[1].len() == 12) {
segments[1] = format!("{}id_", segments[1]);
}
+ url.set_path(&segments.join("/"));
Ok(url)
}
@@ -440,3 +445,22 @@ pub fn download_batch(
done_receiver.recv()?;
Ok(count)
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_rewrite_wayback_url() -> () {
+ assert_eq!(rewrite_wayback_url(Url::parse("https://web.archive.org/web/20140802044207/http://www.geo.coop:80/sites/default/files/labs_of_oligarchy.pdf").unwrap()).unwrap(),
+ Url::parse("https://web.archive.org/web/20140802044207id_/http://www.geo.coop:80/sites/default/files/labs_of_oligarchy.pdf").unwrap());
+
+ // not a wayback URL
+ assert!(rewrite_wayback_url(Url::parse("https://fatcat.wiki/blah.pdf").unwrap()).is_err());
+
+ // too short
+ assert!(
+ rewrite_wayback_url(Url::parse("https://web.archive.org/file.pdf").unwrap()).is_err()
+ );
+ }
+}