summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-09-13 00:23:09 -0700
committerBryan Newbold <bnewbold@robocracy.org>2018-09-13 00:23:09 -0700
commit9f25d84accb5a3657cb4c7dd87014d9f13ccf2ef (patch)
tree4854b13512c65fd0bffddfb791c4c014a41088dc
parentaa9abbbab67c6344d382a964b3c451e0bf212efe (diff)
downloadfatcat-9f25d84accb5a3657cb4c7dd87014d9f13ccf2ef.tar.gz
fatcat-9f25d84accb5a3657cb4c7dd87014d9f13ccf2ef.zip
improve dump scripts
-rw-r--r--extra/quick_dump.sql12
-rw-r--r--extra/sql_dumps/README.md26
-rw-r--r--extra/sql_dumps/abstracts.json.gzbin0 -> 338 bytes
-rw-r--r--extra/sql_dumps/dump_abstracts.sql6
-rw-r--r--extra/sql_dumps/dump_file_hashes.sql11
-rw-r--r--extra/sql_dumps/dump_idents.sql12
-rw-r--r--extra/sql_dumps/dump_release_extid.sql12
-rwxr-xr-xextra/sql_dumps/ident_table_snapshot.sh24
-rw-r--r--extra/sql_dumps/thing.txt5
9 files changed, 96 insertions, 12 deletions
diff --git a/extra/quick_dump.sql b/extra/quick_dump.sql
deleted file mode 100644
index d2d12bb4..00000000
--- a/extra/quick_dump.sql
+++ /dev/null
@@ -1,12 +0,0 @@
-
-BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
-
-COPY (SELECT id, rev_id, redirect_id FROM creator_ident WHERE is_live=true) TO '/tmp/fatcat_ident_creators.tsv';
-COPY (SELECT id, rev_id, redirect_id FROM container_ident WHERE is_live=true) TO '/tmp/fatcat_ident_containers.tsv';
-COPY (SELECT id, rev_id, redirect_id FROM file_ident WHERE is_live=true) TO '/tmp/fatcat_ident_files.tsv';
-COPY (SELECT id, rev_id, redirect_id FROM release_ident WHERE is_live=true) TO '/tmp/fatcat_ident_releases.tsv';
-COPY (SELECT id, rev_id, redirect_id FROM work_ident WHERE is_live=true) TO '/tmp/fatcat_ident_works.tsv';
-
-COPY (SELECT row_to_json(abstracts) FROM abstracts) TO '/tmp/fatcat_abstracts.json';
-
-ROLLBACK;
diff --git a/extra/sql_dumps/README.md b/extra/sql_dumps/README.md
new file mode 100644
index 00000000..6f24207d
--- /dev/null
+++ b/extra/sql_dumps/README.md
@@ -0,0 +1,26 @@
+
+## HOWTO: Ident Table Snapshots
+
+How to take a consistent (single transaction) snapshot of
+
+This will take somewhere around 15-25 GB of disk space on the database server
+(under /tmp). It would probably be better to stream this transaction over a
+network connection (saving database disk I/O), but I can't figure out how to do
+that with plain SQL (multiple table dumps in a single session), so would need
+to be a custom client.
+
+ ./ident_table_snapshot.sh
+
+## HOWTO: Dump abstracts, release identifiers, file hashes, etc
+
+These are run as regular old commands, and can run across the network in a
+couple different ways. We might not want database ports open to the network
+(even cluster/VPN); on the other hand we could proabably do SSH port
+forwarding anyways.
+
+ # Locally, or client running on a remote machine
+ psql fatcat < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip > abstracts.json.gz
+
+ # Run on database server, write to file on remote host
+ psql fatcat < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | gzip | ssh user@host 'cat > abstracts.json.gz'
+
diff --git a/extra/sql_dumps/abstracts.json.gz b/extra/sql_dumps/abstracts.json.gz
new file mode 100644
index 00000000..2b6e3dc2
--- /dev/null
+++ b/extra/sql_dumps/abstracts.json.gz
Binary files differ
diff --git a/extra/sql_dumps/dump_abstracts.sql b/extra/sql_dumps/dump_abstracts.sql
new file mode 100644
index 00000000..9ce369ce
--- /dev/null
+++ b/extra/sql_dumps/dump_abstracts.sql
@@ -0,0 +1,6 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (SELECT row_to_json(abstracts) FROM abstracts) TO STDOUT WITH NULL '';
+
+ROLLBACK;
diff --git a/extra/sql_dumps/dump_file_hashes.sql b/extra/sql_dumps/dump_file_hashes.sql
new file mode 100644
index 00000000..a84c2ee1
--- /dev/null
+++ b/extra/sql_dumps/dump_file_hashes.sql
@@ -0,0 +1,11 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (SELECT file_ident.id, file_rev.id, file_rev.sha1, file_rev.sha256, file_rev.md5
+ FROM file_rev
+ INNER JOIN file_ident ON file_ident.rev_id = file_rev.id
+ WHERE file_ident.is_live = 't' AND file_ident.redirect_id IS NULL)
+ TO STDOUT
+ WITH NULL '';
+
+ROLLBACK;
diff --git a/extra/sql_dumps/dump_idents.sql b/extra/sql_dumps/dump_idents.sql
new file mode 100644
index 00000000..22640cbc
--- /dev/null
+++ b/extra/sql_dumps/dump_idents.sql
@@ -0,0 +1,12 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (SELECT id FROM changelog ORDER BY id DESC LIMIT 1) TO '/tmp/fatcat_ident_latest_changelog.tsv' WITH NULL '';
+COPY (SELECT id, rev_id, redirect_id FROM creator_ident WHERE is_live=true) TO '/tmp/fatcat_ident_creators.tsv' WITH NULL '';
+COPY (SELECT id, rev_id, redirect_id FROM container_ident WHERE is_live=true) TO '/tmp/fatcat_ident_containers.tsv' WITH NULL '';
+COPY (SELECT id, rev_id, redirect_id FROM file_ident WHERE is_live=true) TO '/tmp/fatcat_ident_files.tsv' WITH NULL '';
+COPY (SELECT id, rev_id, redirect_id FROM release_ident WHERE is_live=true) TO '/tmp/fatcat_ident_releases.tsv' WITH NULL '';
+COPY (SELECT id, rev_id, redirect_id FROM work_ident WHERE is_live=true) TO '/tmp/fatcat_ident_works.tsv' WITH NULL '';
+COPY (SELECT id, editgroup_id, timestamp FROM changelog) TO '/tmp/fatcat_ident_changelog.tsv' WITH NULL '';
+
+ROLLBACK;
diff --git a/extra/sql_dumps/dump_release_extid.sql b/extra/sql_dumps/dump_release_extid.sql
new file mode 100644
index 00000000..5e93cb1d
--- /dev/null
+++ b/extra/sql_dumps/dump_release_extid.sql
@@ -0,0 +1,12 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (SELECT release_ident.id, release_rev.id, release_rev.doi, release_rev.pmcid, release_rev.pmid,
+ release_rev.core_id, release_rev.wikidata_qid
+ FROM release_rev
+ INNER JOIN release_ident ON release_ident.rev_id = release_rev.id
+ WHERE release_ident.is_live = 't' AND release_ident.redirect_id IS NULL)
+ TO STDOUT
+ WITH NULL '';
+
+ROLLBACK;
diff --git a/extra/sql_dumps/ident_table_snapshot.sh b/extra/sql_dumps/ident_table_snapshot.sh
new file mode 100755
index 00000000..b887d796
--- /dev/null
+++ b/extra/sql_dumps/ident_table_snapshot.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -e -u -o pipefail
+
+DATESLUG="`date +%Y-%m-%d.%H%M%S`"
+DATABASE="fatcat"
+
+echo "Running SQL..."
+psql fatcat < ./dump_idents.sql
+
+CHANGELOG_REV="`head -n1 /tmp/fatcat_ident_latest_changelog.tsv`"
+OUTFILE="fatcat_idents.$DATESLUG.r$CHANGELOG_REV.tar.xz"
+
+echo "Compressing..."
+tar -C /tmp -c --xz --verbose \
+ -f $OUTFILE \
+ fatcat_ident_latest_changelog.tsv \
+ fatcat_ident_containers.tsv \
+ fatcat_ident_creators.tsv \
+ fatcat_ident_files.tsv \
+ fatcat_ident_releases.tsv \
+ fatcat_ident_works.tsv
+
+echo "Done: $OUTFILE"
diff --git a/extra/sql_dumps/thing.txt b/extra/sql_dumps/thing.txt
new file mode 100644
index 00000000..e5b7672e
--- /dev/null
+++ b/extra/sql_dumps/thing.txt
@@ -0,0 +1,5 @@
+BEGIN
+{"sha1":"1ba86bf8c2979a62d29b18b537e50b2b093be27e","content":"some long abstract in plain text"}
+{"sha1":"0da908ab584b5e445a06beb172e3fab8cb5169e3","content":"<jats>A longer, more correct abstract should in theory go here</jats>"}
+{"sha1":"bd864bf5c612e64473e6b724c1d8f5e53a56bb6e","content":"<jats:p>This brief meeting review summarizes the recommendations of NSF and NPGI funded bioinformaticians concerning the future requirements for plant bioinformatics systems and databases.</jats:p>"}
+ROLLBACK