aboutsummaryrefslogtreecommitdiffstats
path: root/sql/reingest_terminalstatus_forcerecrawl.sh
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-10-03 10:16:26 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-10-03 10:16:26 -0700
commit 54e14814080d9a706ff6f15694b3b54918200169 (patch)
tree c2ce152acb64a365408a57728c18d960b954c1f5 /sql/reingest_terminalstatus_forcerecrawl.sh
parent a04468041cd81ad90aa76ec15788a5ffacb6eec2 (diff)
download sandcrawler-54e14814080d9a706ff6f15694b3b54918200169.tar.gz
sandcrawler-54e14814080d9a706ff6f15694b3b54918200169.zip
reingests: update scripts and SQL
Diffstat (limited to 'sql/reingest_terminalstatus_forcerecrawl.sh')
-rwxr-xr-x sql/reingest_terminalstatus_forcerecrawl.sh 19
1 files changed, 19 insertions, 0 deletions
diff --git a/sql/reingest_terminalstatus_forcerecrawl.sh b/sql/reingest_terminalstatus_forcerecrawl.sh
new file mode 100755
index 0000000..5cb6d51
--- /dev/null
+++ b/sql/reingest_terminalstatus_forcerecrawl.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Reingest pipeline: run the dump SQL, convert rows to JSON with --force-recrawl, then send
+# a random sample of at most 100000 requests to Kafka. Relative paths: presumably run from sql/.
+
+set -euo pipefail # exit on error, on use of an unset variable, or if any pipeline stage fails
+
+# NOTE(review): presumably this SQL writes the *.rows.json file read below — confirm.
+sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json
+
+# `shuf -n` does the sampling itself. The former `shuf | head -n100000` could kill shuf
+# with SIGPIPE once head exited, and `pipefail` then reported the whole run as failed.
+shuf -n 100000 /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1