From 7463049b621f7729b48c5e06429767118c1b8506 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 1 Feb 2019 15:10:34 -0800 Subject: update dump and sort commands Pipeline sorts are *so* starved and slow; they only get a few MByte of RAM by default! --- extra/sql_dumps/README.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'extra/sql_dumps') diff --git a/extra/sql_dumps/README.md b/extra/sql_dumps/README.md index 3538ce8f..45adae09 100644 --- a/extra/sql_dumps/README.md +++ b/extra/sql_dumps/README.md @@ -1,8 +1,6 @@ ## HOWTO: Ident Table Snapshots -How to take a consistent (single transaction) snapshot of - This will take somewhere around 15-25 GB of disk space on the database server (under /tmp). It would probably be better to stream this transaction over a network connection (saving database disk I/O), but I can't figure out how to do @@ -11,6 +9,11 @@ to be a custom client. ./ident_table_snapshot.sh +Or, in production: + + sudo su postgres + DATABASE_URL=fatcat_prod ./ident_table_snapshot.sh /tmp + ## HOWTO: Dump abstracts, release identifiers, file hashes, etc These are run as regular old commands, and can run across the network in a @@ -24,10 +27,16 @@ forwarding anyways. 
# Run on database server, write to file on remote host psql fatcat < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip | ssh user@host 'cat > abstracts.json.gz' +In production: + + sudo -u postgres psql fatcat_prod < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip > /srv/fatcat/snapshots/abstracts.json.gz + sudo -u postgres psql fatcat_prod < dump_file_hashes.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip > /srv/fatcat/snapshots/file_hashes.tsv.gz + sudo -u postgres psql fatcat_prod < dump_release_extid.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip > /srv/fatcat/snapshots/release_extid.tsv.gz + ## HOWTO: Full private database backup and restore export DATESLUG="`date +%Y-%m-%d.%H%M%S`" - sudo -u postgres pg_dump --verbose --format=tar fatcat_prod | gzip > /srv/fatcat/snapshots/fatcat_private_dbdump_${DATESLUG}.tar.gz + time sudo -u postgres pg_dump --verbose --format=tar fatcat_prod | gzip > /srv/fatcat/snapshots/fatcat_private_dbdump_${DATESLUG}.tar.gz NOTE: by using the "directory" export (along with `--file`) instead of "tar" export, it would be possible to use parallel dumping. However, this would put @@ -48,6 +57,7 @@ This dump will contain all tables in the backend schema, except for "private" authentication tables. For local or non-production machines, might need to replace the `fatcat_prod` database name. + # TODO: for production, probably want consistent serialization mode export DATESLUG="`date +%Y-%m-%d.%H%M%S`" sudo -u postgres pg_dump --verbose --format=tar --exclude-table-data=auth_oidc fatcat_prod | gzip > /srv/fatcat/snapshots/fatcat_public_dbdump_${DATESLUG}.tar.gz -- cgit v1.2.3