aboutsummaryrefslogtreecommitdiffstats
path: root/rust
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-09-11 22:09:58 -0700
committerBryan Newbold <bnewbold@robocracy.org>2018-09-11 22:14:36 -0700
commitf7f18bb588023e869c347ddfd23bf1ed5d16e527 (patch)
tree98f39ef82561ffa74dfe64740391e595995df89c /rust
parent538cd2ec602a044dc36ca81d9a2a07863080c764 (diff)
downloadfatcat-f7f18bb588023e869c347ddfd23bf1ed5d16e527.tar.gz
fatcat-f7f18bb588023e869c347ddfd23bf1ed5d16e527.zip
improvements to fatcat-export output
Diffstat (limited to 'rust')
-rw-r--r--rust/Cargo.lock57
-rw-r--r--rust/Cargo.toml1
-rw-r--r--rust/README.export.md13
-rw-r--r--rust/src/bin/fatcat-export.rs89
4 files changed, 136 insertions, 24 deletions
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index 2a7d6017..a69f3d04 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -321,6 +321,18 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
+name = "env_logger"
+version = "0.5.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "atty 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)",
+ "humantime 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "termcolor 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "error"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -356,6 +368,7 @@ dependencies = [
"diesel 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"diesel_migrations 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"dotenv 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "env_logger 0.5.12 (registry+https://github.com/rust-lang/crates.io-index)",
"error-chain 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)",
"fatcat-api-spec 0.1.0",
"futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -440,6 +453,14 @@ version = "1.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
+name = "humantime"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "hyper"
version = "0.10.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -864,6 +885,11 @@ dependencies = [
]
[[package]]
+name = "quick-error"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
name = "quote"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1201,6 +1227,14 @@ dependencies = [
]
[[package]]
+name = "termcolor"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "termion"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1432,10 +1466,27 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
+name = "winapi-util"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
+[[package]]
+name = "wincolor"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
[metadata]
"checksum aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "d6531d44de723825aa81398a6415283229725a00fa30713812ab9323faa82fc4"
"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
@@ -1476,6 +1527,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
"checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
"checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
+"checksum env_logger 0.5.12 (registry+https://github.com/rust-lang/crates.io-index)" = "f4d7e69c283751083d53d01eac767407343b8b69c4bd70058e08adc2637cb257"
"checksum error 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "a6e606f14042bb87cc02ef6a14db6c90ab92ed6f62d87e69377bc759fd7987cc"
"checksum error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff511d5dc435d703f4971bc399647c9bc38e20cb41452e3b9feb4765419ed3f3"
"checksum error-chain 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "07e791d3be96241c77c43846b665ef1384606da2cd2a48730abe606a12906e02"
@@ -1485,6 +1537,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
"checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c"
"checksum httparse 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "c2f407128745b78abc95c0ffbe4e5d37427fdc0d45470710cfef8c44522a2e37"
+"checksum humantime 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0484fda3e7007f2a4a0d9c3a703ca38c71c54c55602ce4660c419fd32e188c9e"
"checksum hyper 0.10.13 (registry+https://github.com/rust-lang/crates.io-index)" = "368cb56b2740ebf4230520e2b90ebb0461e69034d85d1945febd9b3971426db2"
"checksum hyper-openssl 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "5ecb3cd8e4d53f8abe7cb2227e66674bb63c1bd0ba60ca9ba7b74ea1e0054891"
"checksum idna 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "014b298351066f1512874135335d62a789ffe78a9974f94b43ed5621951eaf7d"
@@ -1533,6 +1586,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum plugin 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "1a6a0dc3910bc8db877ffed8e457763b317cf880df4ae19109b9f77d277cf6e0"
"checksum pq-sys 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)" = "9323a6ce484fc41174d40f80ba87af6247f86a7ba57856af68d3aa0c8642d2f0"
"checksum proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "1b06e2f335f48d24442b35a19df506a835fb3547bc3c06ef27340da9acf5cae7"
+"checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0"
"checksum quote 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e920b65c65f10b2ae65c831a81a073a89edd28c7cce89475bff467ab4167a"
"checksum quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9949cfe66888ffe1d53e6ec9d9f3b70714083854be20fd5e271b232a017401e8"
"checksum r2d2 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f9078ca6a8a5568ed142083bb2f7dc9295b69d16f867ddcc9849e51b17d8db46"
@@ -1576,6 +1630,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum take_mut 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
"checksum tempdir 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "15f2b5fb00ccdf689e0149d1b1b3c03fead81c2b37735d812fa8bddbbf41b6d8"
"checksum term 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5e6b677dd1e8214ea1ef4297f85dbcbed8e8cdddb561040cc998ca2551c37561"
+"checksum termcolor 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ff3bac0e465b59f194e7037ed404b0326e56ff234d767edc4c5cc9cd49e7a2c7"
"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096"
"checksum textwrap 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c0b59b6b4b44d867f1370ef1bd91bfb262bf07bf0ae65c202ea2fbc16153b693"
"checksum thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279ef31c19ededf577bfd12dfae728040a21f635b06a24cd670ff510edd38963"
@@ -1608,4 +1663,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3"
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+"checksum winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "afc5508759c5bf4285e61feb862b6083c8480aec864fa17a81fdec6f69b461ab"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+"checksum wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "561ed901ae465d6185fa7864d63fbd5720d0ef718366c9a4dc83cf6170d7e9ba"
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index 7c6695d5..f9a439db 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -38,6 +38,7 @@ serde_json = "1.0"
# Command-line tools
crossbeam-channel = "0.2"
num_cpus = "1"
+env_logger = "*"
# Unused (hyper 0.11 and https)
#hyper-openssl = {version = "0.2", optional = true}
diff --git a/rust/README.export.md b/rust/README.export.md
new file mode 100644
index 00000000..7301a4db
--- /dev/null
+++ b/rust/README.export.md
@@ -0,0 +1,13 @@
+
+First create ident files:
+
+ psql fatcat < ../extra/quick_dump.sql
+
+Then dump:
+
+ cat /tmp/fatcat_ident_releases.tsv | ./target/debug/fatcat-export releases
+
+Or, perhaps, in production:
+
+ cat /tmp/fatcat_ident_releases.tsv | ./target/release/fatcat-export release --expand files,contaner -j8 | pv | gzip > all_releases.json.gz
+
diff --git a/rust/src/bin/fatcat-export.rs b/rust/src/bin/fatcat-export.rs
index 6f9b92cf..1af321ba 100644
--- a/rust/src/bin/fatcat-export.rs
+++ b/rust/src/bin/fatcat-export.rs
@@ -9,16 +9,14 @@ extern crate error_chain;
extern crate fatcat;
extern crate fatcat_api_spec;
#[macro_use]
-extern crate slog;
-extern crate slog_async;
-extern crate slog_term;
+extern crate log;
+extern crate env_logger;
extern crate uuid;
extern crate crossbeam_channel;
extern crate serde_json;
extern crate num_cpus;
use clap::{App, Arg};
-use slog::{Drain, Logger};
use dotenv::dotenv;
use std::env;
@@ -32,6 +30,7 @@ use fatcat::api_helpers::*;
use fatcat::api_entity_crud::*;
use fatcat::errors::*;
+use error_chain::ChainedError;
use std::thread;
//use std::io::{Stdout,StdoutLock};
use crossbeam_channel as channel;
@@ -71,18 +70,26 @@ pub fn database_worker_pool() -> Result<ConnectionPool> {
macro_rules! generic_loop_work {
($fn_name:ident, $entity_model:ident) => {
- fn $fn_name(row_receiver: channel::Receiver<IdentRow>, output_sender: channel::Sender<String>, db_conn: &DbConn, expand: Option<ExpandFlags>) -> Result<()> {
- for row in row_receiver {
- let mut entity = $entity_model::db_get_rev(db_conn, row.rev_id.expect("valid, non-deleted row"))?;
- //let mut entity = ReleaseEntity::db_get_rev(db_conn, row.rev_id.expect("valid, non-deleted row"))?;
- entity.state = Some("active".to_string()); // XXX
- entity.ident = Some(row.ident_id.to_string());
- if let Some(expand) = expand {
- entity.db_expand(db_conn, expand)?;
+ fn $fn_name(row_receiver: channel::Receiver<IdentRow>, output_sender: channel::Sender<String>, db_conn: &DbConn, expand: Option<ExpandFlags>) {
+ let result: Result<()> = (|| {
+ for row in row_receiver {
+ let mut entity = $entity_model::db_get_rev(db_conn, row.rev_id.expect("valid, non-deleted row"))
+ .chain_err(|| "reading entity from database")?;
+ //let mut entity = ReleaseEntity::db_get_rev(db_conn, row.rev_id.expect("valid, non-deleted row"))?;
+ entity.state = Some("active".to_string()); // XXX
+ entity.ident = Some(row.ident_id.to_string());
+ if let Some(expand) = expand {
+ entity.db_expand(db_conn, expand)
+ .chain_err(|| "expanding sub-entities from database")?;
+ }
+ output_sender.send(serde_json::to_string(&entity)?);
}
- output_sender.send(serde_json::to_string(&entity)?);
+ Ok(())
+ })();
+ if let Err(ref e) = result {
+ error!("{}", e.display_chain())
}
- Ok(())
+ result.unwrap()
}
}
}
@@ -114,14 +121,23 @@ fn parse_line(s: String) -> Result<IdentRow> {
}
Ok(IdentRow {
ident_id: FatCatId::from_uuid(&Uuid::from_str(&fields[0])?),
- rev_id: Some(Uuid::from_str(&fields[1])?),
- redirect_id: None,
+ rev_id: match fields[1].as_ref() {
+ "\\N" => None,
+ val => Some(Uuid::from_str(&val)?),
+ },
+ redirect_id: match fields[2].as_ref() {
+ "\\N" => None,
+ val => Some(FatCatId::from_uuid(&Uuid::from_str(&val)?)),
+ },
})
}
#[test]
fn test_parse_line() {
- assert!(false)
+ assert!(parse_line("00000000-0000-0000-3333-000000000001\t00000000-0000-0000-3333-fff000000001\t\\N").is_ok());
+ assert!(parse_line("00000-0000-0000-3333-000000000001\t00000000-0000-0000-3333-fff000000001\t\\N").is_err());
+ assert!(parse_line("00000-0000-0000-3333-000000000001\t00000000-0000-0000-3333-fff000000001").is_err());
+ assert!(parse_line("00000000-0000-0000-3333-000000000002\t00000000-0000-0000-3333-fff000000001\t00000000-0000-0000-3333-000000000001").is_ok());
}
// Use num_cpus/2, or CLI arg for worker count
@@ -135,7 +151,7 @@ fn test_parse_line() {
// 4. read rows, pushing to row channel
// => every N lines, log to stderr
// 5. wait for all channels to finish, and quit
-pub fn do_export(num_workers: usize, expand: Option<ExpandFlags>, entity_type: ExportEntityType) -> Result<()> {
+pub fn do_export(num_workers: usize, expand: Option<ExpandFlags>, entity_type: ExportEntityType, redirects: bool) -> Result<()> {
let db_pool = database_worker_pool()?;
let buf_input = BufReader::new(std::io::stdin());
@@ -143,6 +159,8 @@ pub fn do_export(num_workers: usize, expand: Option<ExpandFlags>, entity_type: E
let (output_sender, output_receiver) = channel::bounded(CHANNEL_BUFFER_LEN);
let (done_sender, done_receiver) = channel::bounded(0);
+ info!("Starting an export of {} entities", entity_type);
+
// Start row worker threads
assert!(num_workers > 0);
for _ in 0..num_workers {
@@ -164,15 +182,25 @@ pub fn do_export(num_workers: usize, expand: Option<ExpandFlags>, entity_type: E
}
drop(output_sender);
// Start printer thread
- thread::spawn(move || loop_printer(output_receiver, done_sender));
+ thread::spawn(move || loop_printer(output_receiver, done_sender).expect("printing to stdout") );
+ let mut count = 0;
for line in buf_input.lines() {
let line = line?;
let row = parse_line(line)?;
- row_sender.send(row);
+ match (row.rev_id, row.redirect_id, redirects) {
+ (None, _, _) => (),
+ (Some(_), Some(_), false) => (),
+ _ => row_sender.send(row),
+ }
+ count += 1;
+ if count % 1000 == 0 {
+ info!("processed {} lines...", count);
+ }
}
drop(row_sender);
done_receiver.recv();
+ info!("Done reading ({} lines), waiting for workers to exit...", count);
Ok(())
}
@@ -182,28 +210,41 @@ fn run() -> Result<()> {
.version(env!("CARGO_PKG_VERSION"))
.author("Bryan Newbold <bnewbold@archive.org>")
.about("Fast exports of database entities from an id list")
+ .arg(Arg::from_usage("<entity_type> 'what entity type the idents correspond to'")
+ .possible_values(&ExportEntityType::variants())
+ .case_insensitive(true))
.args_from_usage(
- "-f --workers=[workers] 'number of threads (database connections) to use'
- --expand=[expand] 'sub-entities to include in dump'
- <entity_type> 'what entity type the idents correspond to'")
+ "-j --workers=[workers] 'number of threads (database connections) to use'
+ -q --quiet 'less status output to stderr'
+ --include-redirects 'include redirected idents (normally skipped)'
+ --expand=[expand] 'sub-entities to include in dump'")
.after_help("Reads a ident table TSV dump from stdin (aka, ident_id, rev_id, redirect_id), \
and outputs JSON (one entity per line). Database connection info read from environment \
(DATABASE_URL, same as fatcatd).")
.get_matches();
let num_workers: usize = match m.value_of("workers") {
- Some(v) => value_t_or_exit!(m.value_of("workers"), usize),
+ Some(_) => value_t_or_exit!(m.value_of("workers"), usize),
None => std::cmp::min(1, num_cpus::get() / 2) as usize,
};
let expand = match m.value_of("expand") {
Some(s) => Some(ExpandFlags::from_str(&s)?),
None => None,
};
+ let log_level = if m.is_present("quiet") {
+ "warn"
+ } else {
+ "info"
+ };
+ let env = env_logger::Env::default()
+ .filter_or(env_logger::DEFAULT_FILTER_ENV, log_level);
+ env_logger::Builder::from_env(env).init();
do_export(
num_workers,
expand,
value_t_or_exit!(m.value_of("entity_type"), ExportEntityType),
+ m.is_present("include_redirects"),
)
}