1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
use anyhow::{anyhow, Context, Result};
use indicatif::ProgressBar;
use fatcat_openapi::models::FileEntity;
use reqwest::header::USER_AGENT;
use url::Url;
use std::fs::File;
/// Outcome of a download attempt, as reported by `download_file`.
///
/// `Eq` is derived alongside `PartialEq` because every payload is a `String`, so equality is
/// total.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum DownloadStatus {
    /// NOTE(review): presumably the file was already present locally and the payload is its
    /// path; nothing in this module constructs this variant yet -- confirm against callers.
    Exists(String),
    /// NOTE(review): presumably the file was fetched successfully and the payload is the path
    /// it was written to; nothing in this module constructs this variant yet -- confirm.
    Downloaded(String),
    /// The server answered with a non-success HTTP status; the payload is that status
    /// formatted as text.
    NetworkError(String),
    /// NOTE(review): presumably the entity has no publicly accessible copy; not constructed in
    /// this module -- confirm intended meaning.
    NoAccess,
    /// Placeholder status: currently what `download_file` returns once the response body has
    /// been copied to disk.
    NotYet,
}
/// Appends the `id_` modifier to the timestamp of a Wayback Machine capture URL.
///
/// A capture URL is recognised by its path shape: the first segment is `web` and the second is
/// a 14-digit timestamp (`YYYYMMDDhhmmss`). For such URLs the timestamp segment becomes
/// `<timestamp>id_`; the rest of the path, the query and the fragment are kept. Any other URL
/// (including one that has no path segments at all) is returned unchanged.
///
/// NOTE(review): `id_` is understood to make the Wayback Machine serve the capture as
/// originally archived, without its own rewriting -- confirm against Internet Archive docs.
///
/// The caller is expected to have checked the host; this function only inspects the path.
///
/// # Errors
///
/// Currently never returns `Err`; the `Result` is kept for interface stability.
// eg, https://web.archive.org/web/20140802044207/http://www.geo.coop:80/sites/default/files/labs_of_oligarchy.pdf
fn rewrite_wayback_url(mut url: Url) -> Result<Url> {
    // TODO: add tests
    // Build the new path as an owned `String` so the borrow of `url` ends before `set_path`.
    let rewritten_path = url.path_segments().and_then(|segments| {
        let mut segments: Vec<String> = segments.map(str::to_string).collect();
        let is_capture = segments.first().map(String::as_str) == Some("web")
            && segments.get(1).map_or(false, |timestamp| {
                timestamp.len() == 14 && timestamp.bytes().all(|b| b.is_ascii_digit())
            });
        if !is_capture {
            return None;
        }
        segments[1].push_str("id_");
        // Segments are already percent-encoded, so joining them reproduces the original path.
        Some(segments.join("/"))
    });
    if let Some(path) = rewritten_path {
        url.set_path(&path);
    }
    Ok(url)
}
/// Attempts to download a file entity, including verifying checksum.
pub fn download_file(fe: FileEntity) -> Result<DownloadStatus> {
// TODO: check if file has sha1hex
// TODO: check if file already exists
// TODO: only archive.org URLs
let raw_url = fe.urls.unwrap()[0].url.clone();
let mut url = Url::parse(&raw_url)?;
if url.host_str() == Some("web.archive.org") {
url = rewrite_wayback_url(url)?;
}
// TODO: open temporary file (real file plus suffix?)
let out_file = File::create(format!("{}.pdf", fe.sha1.unwrap()))?;
println!("downloading: {}", url);
let client = reqwest::blocking::Client::new();
let mut resp = client.get(url)
.header(USER_AGENT, "fatcat-cli/0.0.0")
.send()?;
// TODO: parse headers
// TODO: resp.error_for_status()?;
if !resp.status().is_success() {
return Ok(DownloadStatus::NetworkError(format!("{}", resp.status())));
}
// TODO: what if no filesize?
// TODO: compare with resp.content_length(() -> Option<u64>
let pb = ProgressBar::new(fe.size.unwrap() as u64);
let out_size = resp.copy_to(&mut pb.wrap_write(out_file))?;
Ok(DownloadStatus::NotYet)
}
|