diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-13 00:23:09 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-13 00:23:09 -0700 |
commit | 9f25d84accb5a3657cb4c7dd87014d9f13ccf2ef (patch) | |
tree | 4854b13512c65fd0bffddfb791c4c014a41088dc | |
parent | aa9abbbab67c6344d382a964b3c451e0bf212efe (diff) | |
download | fatcat-9f25d84accb5a3657cb4c7dd87014d9f13ccf2ef.tar.gz fatcat-9f25d84accb5a3657cb4c7dd87014d9f13ccf2ef.zip |
improve dump scripts
-rw-r--r-- | extra/quick_dump.sql | 12 | ||||
-rw-r--r-- | extra/sql_dumps/README.md | 26 | ||||
-rw-r--r-- | extra/sql_dumps/abstracts.json.gz | bin | 0 -> 338 bytes | |||
-rw-r--r-- | extra/sql_dumps/dump_abstracts.sql | 6 | ||||
-rw-r--r-- | extra/sql_dumps/dump_file_hashes.sql | 11 | ||||
-rw-r--r-- | extra/sql_dumps/dump_idents.sql | 12 | ||||
-rw-r--r-- | extra/sql_dumps/dump_release_extid.sql | 12 | ||||
-rwxr-xr-x | extra/sql_dumps/ident_table_snapshot.sh | 24 | ||||
-rw-r--r-- | extra/sql_dumps/thing.txt | 5 |
9 files changed, 96 insertions, 12 deletions
diff --git a/extra/quick_dump.sql b/extra/quick_dump.sql deleted file mode 100644 index d2d12bb4..00000000 --- a/extra/quick_dump.sql +++ /dev/null @@ -1,12 +0,0 @@ - -BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; - -COPY (SELECT id, rev_id, redirect_id FROM creator_ident WHERE is_live=true) TO '/tmp/fatcat_ident_creators.tsv'; -COPY (SELECT id, rev_id, redirect_id FROM container_ident WHERE is_live=true) TO '/tmp/fatcat_ident_containers.tsv'; -COPY (SELECT id, rev_id, redirect_id FROM file_ident WHERE is_live=true) TO '/tmp/fatcat_ident_files.tsv'; -COPY (SELECT id, rev_id, redirect_id FROM release_ident WHERE is_live=true) TO '/tmp/fatcat_ident_releases.tsv'; -COPY (SELECT id, rev_id, redirect_id FROM work_ident WHERE is_live=true) TO '/tmp/fatcat_ident_works.tsv'; - -COPY (SELECT row_to_json(abstracts) FROM abstracts) TO '/tmp/fatcat_abstracts.json'; - -ROLLBACK; diff --git a/extra/sql_dumps/README.md b/extra/sql_dumps/README.md new file mode 100644 index 00000000..6f24207d --- /dev/null +++ b/extra/sql_dumps/README.md @@ -0,0 +1,26 @@ + +## HOWTO: Ident Table Snapshots + +How to take a consistent (single transaction) snapshot of + +This will take somewhere around 15-25 GB of disk space on the database server +(under /tmp). It would probably be better to stream this transaction over a +network connection (saving database disk I/O), but I can't figure out how to do +that with plain SQL (multiple table dumps in a single session), so would need +to be a custom client. + + ./ident_table_snapshot.sh + +## HOWTO: Dump abstracts, release identifiers, file hashes, etc + +These are run as regular old commands, and can run across the network in a +couple different ways. We might not want database ports open to the network +(even cluster/VPN); on the other hand we could proabably do SSH port +forwarding anyways. + + # Locally, or client running on a remote machine + psql fatcat < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip > abstracts.json.gz + + # Run on database server, write to file on remote host + psql fatcat < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip | ssh user@host 'cat > abstracts.json.gz' + diff --git a/extra/sql_dumps/abstracts.json.gz b/extra/sql_dumps/abstracts.json.gz Binary files differnew file mode 100644 index 00000000..2b6e3dc2 --- /dev/null +++ b/extra/sql_dumps/abstracts.json.gz diff --git a/extra/sql_dumps/dump_abstracts.sql b/extra/sql_dumps/dump_abstracts.sql new file mode 100644 index 00000000..9ce369ce --- /dev/null +++ b/extra/sql_dumps/dump_abstracts.sql @@ -0,0 +1,6 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY (SELECT row_to_json(abstracts) FROM abstracts) TO STDOUT WITH NULL ''; + +ROLLBACK; diff --git a/extra/sql_dumps/dump_file_hashes.sql b/extra/sql_dumps/dump_file_hashes.sql new file mode 100644 index 00000000..a84c2ee1 --- /dev/null +++ b/extra/sql_dumps/dump_file_hashes.sql @@ -0,0 +1,11 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY (SELECT file_ident.id, file_rev.id, file_rev.sha1, file_rev.sha256, file_rev.md5 + FROM file_rev + INNER JOIN file_ident ON file_ident.rev_id = file_rev.id + WHERE file_ident.is_live = 't' AND file_ident.redirect_id IS NULL) + TO STDOUT + WITH NULL ''; + +ROLLBACK; diff --git a/extra/sql_dumps/dump_idents.sql b/extra/sql_dumps/dump_idents.sql new file mode 100644 index 00000000..22640cbc --- /dev/null +++ b/extra/sql_dumps/dump_idents.sql @@ -0,0 +1,12 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY (SELECT id FROM changelog ORDER BY id DESC LIMIT 1) TO '/tmp/fatcat_ident_latest_changelog.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM creator_ident WHERE is_live=true) TO '/tmp/fatcat_ident_creators.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM container_ident WHERE is_live=true) TO '/tmp/fatcat_ident_containers.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM file_ident WHERE is_live=true) TO '/tmp/fatcat_ident_files.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM release_ident WHERE is_live=true) TO '/tmp/fatcat_ident_releases.tsv' WITH NULL ''; +COPY (SELECT id, rev_id, redirect_id FROM work_ident WHERE is_live=true) TO '/tmp/fatcat_ident_works.tsv' WITH NULL ''; +COPY (SELECT id, editgroup_id, timestamp FROM changelog) TO '/tmp/fatcat_ident_changelog.tsv' WITH NULL ''; + +ROLLBACK; diff --git a/extra/sql_dumps/dump_release_extid.sql b/extra/sql_dumps/dump_release_extid.sql new file mode 100644 index 00000000..5e93cb1d --- /dev/null +++ b/extra/sql_dumps/dump_release_extid.sql @@ -0,0 +1,12 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY (SELECT release_ident.id, release_rev.id, release_rev.doi, release_rev.pmcid, release_rev.pmid, + release_rev.core_id, release_rev.wikidata_qid + FROM release_rev + INNER JOIN release_ident ON release_ident.rev_id = release_rev.id + WHERE release_ident.is_live = 't' AND release_ident.redirect_id IS NULL) + TO STDOUT + WITH NULL ''; + +ROLLBACK; diff --git a/extra/sql_dumps/ident_table_snapshot.sh b/extra/sql_dumps/ident_table_snapshot.sh new file mode 100755 index 00000000..b887d796 --- /dev/null +++ b/extra/sql_dumps/ident_table_snapshot.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -e -u -o pipefail + +DATESLUG="`date +%Y-%m-%d.%H%M%S`" +DATABASE="fatcat" + +echo "Running SQL..." +psql fatcat < ./dump_idents.sql + +CHANGELOG_REV="`head -n1 /tmp/fatcat_ident_latest_changelog.tsv`" +OUTFILE="fatcat_idents.$DATESLUG.r$CHANGELOG_REV.tar.xz" + +echo "Compressing..." +tar -C /tmp -c --xz --verbose \ + -f $OUTFILE \ + fatcat_ident_latest_changelog.tsv \ + fatcat_ident_containers.tsv \ + fatcat_ident_creators.tsv \ + fatcat_ident_files.tsv \ + fatcat_ident_releases.tsv \ + fatcat_ident_works.tsv + +echo "Done: $OUTFILE" diff --git a/extra/sql_dumps/thing.txt b/extra/sql_dumps/thing.txt new file mode 100644 index 00000000..e5b7672e --- /dev/null +++ b/extra/sql_dumps/thing.txt @@ -0,0 +1,5 @@ +BEGIN +{"sha1":"1ba86bf8c2979a62d29b18b537e50b2b093be27e","content":"some long abstract in plain text"} +{"sha1":"0da908ab584b5e445a06beb172e3fab8cb5169e3","content":"<jats>A longer, more correct abstract should in theory go here</jats>"} +{"sha1":"bd864bf5c612e64473e6b724c1d8f5e53a56bb6e","content":"<jats:p>This brief meeting review summarizes the recommendations of NSF and NPGI funded bioinformaticians concerning the future requirements for plant bioinformatics systems and databases.</jats:p>"} +ROLLBACK |