diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-21 12:38:09 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-21 12:38:09 -0700 |
commit | 4a9fba8005e0a65c03198c674d2c65f7440d71a6 (patch) | |
tree | d424ca51049632386aaf5762c2b45685d304cd1f | |
parent | f1936476985231286ad1abc74318cc06e20e2627 (diff) | |
download | sandcrawler-4a9fba8005e0a65c03198c674d2c65f7440d71a6.tar.gz sandcrawler-4a9fba8005e0a65c03198c674d2c65f7440d71a6.zip |
SQL: update weekly/quarterly ingest retry scripts
-rw-r--r-- | sql/dump_reingest_quarterly.sql | 78 | ||||
-rw-r--r-- | sql/dump_reingest_weekly.sql (renamed from sql/dump_reingest.sql) | 15 | ||||
-rw-r--r-- | sql/dump_unextracted_pdf.sql | 4 | ||||
-rwxr-xr-x | sql/reingest_quarterly.sh | 20 | ||||
-rwxr-xr-x | sql/reingest_weekly.sh | 20 |
5 files changed, 119 insertions, 18 deletions
diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql new file mode 100644 index 0000000..303824b --- /dev/null +++ b/sql/dump_reingest_quarterly.sql @@ -0,0 +1,78 @@ + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '91 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'spn2-%' + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-wayback-error' +) TO '/grande/snapshots/reingest_quarterly_spn2-error_current.rows.json'; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'cdx-error' + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '91 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') +) TO '/grande/snapshots/reingest_quarterly_cdx-error_current.rows.json'; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'cdx-error' + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '91 day'::INTERVAL + AND 
(ingest_request.ingest_request_source != 'fatcat-changelog' + AND ingest_request.ingest_request_source != 'fatcat-ingest') +) TO '/grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json'; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'wayback-error' + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '91 day'::INTERVAL +) TO '/grande/snapshots/reingest_quarterly_wayback-error_current.rows.json'; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'gateway-timeout' + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '91 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') +) TO '/grande/snapshots/reingest_quarterly_gateway-timeout.rows.json'; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'petabox-error' + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '91 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') +) TO 
'/grande/snapshots/reingest_quarterly_petabox-error_current.rows.json'; + diff --git a/sql/dump_reingest.sql b/sql/dump_reingest_weekly.sql index 5ec2883..28547a4 100644 --- a/sql/dump_reingest.sql +++ b/sql/dump_reingest_weekly.sql @@ -4,12 +4,13 @@ COPY ( LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url WHERE ingest_request.ingest_type = 'pdf' AND ingest_file_result.ingest_type = 'pdf' - AND ingest_request.created < NOW() - '6 hour'::INTERVAL + AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '8 day'::INTERVAL AND ingest_file_result.hit = false AND ingest_file_result.status like 'spn2-%' AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' - AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-wayback-error' ) TO '/grande/snapshots/reingest_weekly_spn2-error_current.rows.json'; COPY ( @@ -19,7 +20,7 @@ COPY ( AND ingest_file_result.ingest_type = 'pdf' AND ingest_file_result.hit = false AND ingest_file_result.status like 'cdx-error' - AND ingest_request.created < NOW() - '6 hour'::INTERVAL + AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '8 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') @@ -32,7 +33,7 @@ COPY ( AND ingest_file_result.ingest_type = 'pdf' AND ingest_file_result.hit = false AND ingest_file_result.status like 'cdx-error' - AND ingest_request.created < NOW() - '6 hour'::INTERVAL + AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '8 day'::INTERVAL AND (ingest_request.ingest_request_source != 'fatcat-changelog' AND ingest_request.ingest_request_source != 'fatcat-ingest') @@ -45,7 +46,7 @@ COPY ( AND ingest_file_result.ingest_type = 'pdf' AND ingest_file_result.hit = 
false AND ingest_file_result.status like 'wayback-error' - AND ingest_request.created < NOW() - '6 hour'::INTERVAL + AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '8 day'::INTERVAL ) TO '/grande/snapshots/reingest_weekly_wayback-error_current.rows.json'; @@ -56,7 +57,7 @@ COPY ( AND ingest_file_result.ingest_type = 'pdf' AND ingest_file_result.hit = false AND ingest_file_result.status like 'gateway-timeout' - AND ingest_request.created < NOW() - '6 hour'::INTERVAL + AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '8 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') @@ -69,7 +70,7 @@ COPY ( AND ingest_file_result.ingest_type = 'pdf' AND ingest_file_result.hit = false AND ingest_file_result.status like 'petabox-error' - AND ingest_request.created < NOW() - '6 hour'::INTERVAL + AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '8 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql index 7b5e823..fb4b0af 100644 --- a/sql/dump_unextracted_pdf.sql +++ b/sql/dump_unextracted_pdf.sql @@ -9,12 +9,14 @@ COPY ( FROM grobid LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex WHERE cdx.sha1hex IS NOT NULL --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL AND pdf_meta.sha1hex IS NULL ) -TO '/grande/snapshots/dump_unextracted_pdf.fatcat.2020-07-22.json' +TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json' WITH NULL ''; ROLLBACK; diff --git 
a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh new file mode 100755 index 0000000..44a22b3 --- /dev/null +++ b/sql/reingest_quarterly.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql + +cd ../python +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_spn2-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_wayback-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_quarterly_gateway-timeout.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_petabox-error_current.json + +cat /grande/snapshots/reingest_quarterly_spn2-error_current.json /grande/snapshots/reingest_quarterly_cdx-error_current.json /grande/snapshots/reingest_quarterly_wayback-error_current.json /grande/snapshots/reingest_quarterly_petabox-error_current.json /grande/snapshots/reingest_quarterly_gateway-timeout.json | shuf | head -n100000 | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +cat /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh index f501b20..dfd4869 100755 --- a/sql/reingest_weekly.sh +++ b/sql/reingest_weekly.sh @@ -1,20 +1,20 @@ -#!/usr/bin/bash +#!/bin/bash set -e # fail on error set -u # fail if variable not set in substitution set -o pipefail # fail if part of a '|' command fails -sudo -u postgres psql < dump_reingest.sql +sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql cd ../python -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_spn2-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_wayback-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_weekly_gateway-timeout.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_petabox-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py 
/grande/snapshots/reingest_weekly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_spn2-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_wayback-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_weekly_gateway-timeout.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_petabox-error_current.json -cat /grande/snapshots/reingest_weekly_spn2-error_current.json /grande/snapshots/reingest_weekly_cdx-error_current.json /grande/snapshots/reingest_weekly_wayback-error_current.json /grande/snapshots/reingest_weekly_petabox-error_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 +cat /grande/snapshots/reingest_weekly_spn2-error_current.json /grande/snapshots/reingest_weekly_cdx-error_current.json /grande/snapshots/reingest_weekly_wayback-error_current.json /grande/snapshots/reingest_weekly_petabox-error_current.json /grande/snapshots/reingest_weekly_gateway-timeout.json | shuf | head -n40000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 -cat /grande/snapshots/reingest_weekly_gateway-timeout.json | shuf | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p $(RANDOM % 24) +cat /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 |