From 9f25d84accb5a3657cb4c7dd87014d9f13ccf2ef Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 13 Sep 2018 00:23:09 -0700 Subject: improve dump scripts --- extra/sql_dumps/README.md | 26 ++++++++++++++++++++++++++ extra/sql_dumps/abstracts.json.gz | Bin 0 -> 338 bytes extra/sql_dumps/dump_abstracts.sql | 6 ++++++ extra/sql_dumps/dump_file_hashes.sql | 11 +++++++++++ extra/sql_dumps/dump_idents.sql | 12 ++++++++++++ extra/sql_dumps/dump_release_extid.sql | 12 ++++++++++++ extra/sql_dumps/ident_table_snapshot.sh | 24 ++++++++++++++++++++++++ extra/sql_dumps/thing.txt | 5 +++++ 8 files changed, 96 insertions(+) create mode 100644 extra/sql_dumps/README.md create mode 100644 extra/sql_dumps/abstracts.json.gz create mode 100644 extra/sql_dumps/dump_abstracts.sql create mode 100644 extra/sql_dumps/dump_file_hashes.sql create mode 100644 extra/sql_dumps/dump_idents.sql create mode 100644 extra/sql_dumps/dump_release_extid.sql create mode 100755 extra/sql_dumps/ident_table_snapshot.sh create mode 100644 extra/sql_dumps/thing.txt (limited to 'extra/sql_dumps') diff --git a/extra/sql_dumps/README.md b/extra/sql_dumps/README.md new file mode 100644 index 00000000..6f24207d --- /dev/null +++ b/extra/sql_dumps/README.md @@ -0,0 +1,26 @@ + +## HOWTO: Ident Table Snapshots + +How to take a consistent (single transaction) snapshot of + +This will take somewhere around 15-25 GB of disk space on the database server +(under /tmp). It would probably be better to stream this transaction over a +network connection (saving database disk I/O), but I can't figure out how to do +that with plain SQL (multiple table dumps in a single session), so would need +to be a custom client. + + ./ident_table_snapshot.sh + +## HOWTO: Dump abstracts, release identifiers, file hashes, etc + +These are run as regular old commands, and can run across the network in a +couple different ways. We might not want database ports open to the network +(even cluster/VPN); on the other hand we could proabably do SSH port +forwarding anyways. + + # Locally, or client running on a remote machine + psql fatcat < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip > abstracts.json.gz + + # Run on database server, write to file on remote host + psql fatcat < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip | ssh user@host 'cat > abstracts.json.gz' + diff --git a/extra/sql_dumps/abstracts.json.gz b/extra/sql_dumps/abstracts.json.gz new file mode 100644 index 00000000..2b6e3dc2 Binary files /dev/null and b/extra/sql_dumps/abstracts.json.gz differ diff --git a/extra/sql_dumps/dump_abstracts.sql b/extra/sql_dumps/dump_abstracts.sql new file mode 100644 index 00000000..9ce369ce --- /dev/null +++ b/extra/sql_dumps/dump_abstracts.sql @@ -0,0 +1,6 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY (SELECT row_to_json(abstracts) FROM abstracts) TO STDOUT WITH NULL ''; + +ROLLBACK; diff --git a/extra/sql_dumps/dump_file_hashes.sql b/extra/sql_dumps/dump_file_hashes.sql new file mode 100644 index 00000000..a84c2ee1 --- /dev/null +++ b/extra/sql_dumps/dump_file_hashes.sql @@ -0,0 +1,11 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY (SELECT file_ident.id, file_rev.id, file_rev.sha1, file_rev.sha256, file_rev.md5 + FROM file_rev + INNER JOIN file_ident ON file_ident.rev_id = file_rev.id + WHERE file_ident.is_live = 't' AND file_ident.redirect_id IS NULL) + TO STDOUT + WITH NULL ''; + +ROLLBACK; diff --git a/extra/sql_dumps/dump_idents.sql b/extra/sql_dumps/dump_idents.sql new file mode 100644 index 00000000..22640cbc --- /dev/null +++ b/extra/sql_dumps/dump_idents.sql @@ -0,0 +1,12 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY (SELECT id FROM changelog ORDER BY id DESC LIMIT 1) TO '/tmp/fatcat_ident_latest_changelog.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM creator_ident WHERE is_live=true) TO '/tmp/fatcat_ident_creators.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM container_ident WHERE is_live=true) TO '/tmp/fatcat_ident_containers.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM file_ident WHERE is_live=true) TO '/tmp/fatcat_ident_files.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM release_ident WHERE is_live=true) TO '/tmp/fatcat_ident_releases.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM work_ident WHERE is_live=true) TO '/tmp/fatcat_ident_works.tsv' WITH NULL ''; +COPY (SELECT id, editgroup_id, timestamp FROM changelog) TO '/tmp/fatcat_ident_changelog.tsv' WITH NULL ''; + +ROLLBACK; diff --git a/extra/sql_dumps/dump_release_extid.sql b/extra/sql_dumps/dump_release_extid.sql new file mode 100644 index 00000000..5e93cb1d --- /dev/null +++ b/extra/sql_dumps/dump_release_extid.sql @@ -0,0 +1,12 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY (SELECT release_ident.id, release_rev.id, release_rev.doi, release_rev.pmcid, release_rev.pmid, + release_rev.core_id, release_rev.wikidata_qid + FROM release_rev + INNER JOIN release_ident ON release_ident.rev_id = release_rev.id + WHERE release_ident.is_live = 't' AND release_ident.redirect_id IS NULL) + TO STDOUT + WITH NULL ''; + +ROLLBACK; diff --git a/extra/sql_dumps/ident_table_snapshot.sh b/extra/sql_dumps/ident_table_snapshot.sh new file mode 100755 index 00000000..b887d796 --- /dev/null +++ b/extra/sql_dumps/ident_table_snapshot.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -e -u -o pipefail + +DATESLUG="`date +%Y-%m-%d.%H%M%S`" +DATABASE="fatcat" + +echo "Running SQL..." +psql fatcat < ./dump_idents.sql + +CHANGELOG_REV="`head -n1 /tmp/fatcat_ident_latest_changelog.tsv`" +OUTFILE="fatcat_idents.$DATESLUG.r$CHANGELOG_REV.tar.xz" + +echo "Compressing..." +tar -C /tmp -c --xz --verbose \ + -f $OUTFILE \ + fatcat_ident_latest_changelog.tsv \ + fatcat_ident_containers.tsv \ + fatcat_ident_creators.tsv \ + fatcat_ident_files.tsv \ + fatcat_ident_releases.tsv \ + fatcat_ident_works.tsv + +echo "Done: $OUTFILE" diff --git a/extra/sql_dumps/thing.txt b/extra/sql_dumps/thing.txt new file mode 100644 index 00000000..e5b7672e --- /dev/null +++ b/extra/sql_dumps/thing.txt @@ -0,0 +1,5 @@ +BEGIN +{"sha1":"1ba86bf8c2979a62d29b18b537e50b2b093be27e","content":"some long abstract in plain text"} +{"sha1":"0da908ab584b5e445a06beb172e3fab8cb5169e3","content":"A longer, more correct abstract should in theory go here"} +{"sha1":"bd864bf5c612e64473e6b724c1d8f5e53a56bb6e","content":"This brief meeting review summarizes the recommendations of NSF and NPGI funded bioinformaticians concerning the future requirements for plant bioinformatics systems and databases."} +ROLLBACK -- cgit v1.2.3