diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-10-21 12:38:09 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-21 12:38:09 -0700 | 
| commit | 4a9fba8005e0a65c03198c674d2c65f7440d71a6 (patch) | |
| tree | d424ca51049632386aaf5762c2b45685d304cd1f | |
| parent | f1936476985231286ad1abc74318cc06e20e2627 (diff) | |
| download | sandcrawler-4a9fba8005e0a65c03198c674d2c65f7440d71a6.tar.gz sandcrawler-4a9fba8005e0a65c03198c674d2c65f7440d71a6.zip  | |
SQL: update weekly/quarterly ingest retry scripts
| -rw-r--r-- | sql/dump_reingest_quarterly.sql | 78 | ||||
| -rw-r--r-- | sql/dump_reingest_weekly.sql (renamed from sql/dump_reingest.sql) | 15 | ||||
| -rw-r--r-- | sql/dump_unextracted_pdf.sql | 4 | ||||
| -rwxr-xr-x | sql/reingest_quarterly.sh | 20 | ||||
| -rwxr-xr-x | sql/reingest_weekly.sh | 20 | 
5 files changed, 119 insertions, 18 deletions
diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql new file mode 100644 index 0000000..303824b --- /dev/null +++ b/sql/dump_reingest_quarterly.sql @@ -0,0 +1,78 @@ + +COPY ( +    SELECT row_to_json(ingest_request.*) FROM ingest_request +    LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url +    WHERE ingest_request.ingest_type = 'pdf' +        AND ingest_file_result.ingest_type = 'pdf' +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL +        AND ingest_request.created > NOW() - '91 day'::INTERVAL +        AND ingest_file_result.hit = false +        AND ingest_file_result.status like 'spn2-%' +        AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' +        AND ingest_file_result.status != 'spn2-error:filesize-limit' +        AND ingest_file_result.status != 'spn2-wayback-error' +) TO '/grande/snapshots/reingest_quarterly_spn2-error_current.rows.json'; + +COPY ( +    SELECT row_to_json(ingest_request.*) FROM ingest_request +    LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url +    WHERE ingest_request.ingest_type = 'pdf' +        AND ingest_file_result.ingest_type = 'pdf' +        AND ingest_file_result.hit = false +        AND ingest_file_result.status like 'cdx-error' +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL +        AND ingest_request.created > NOW() - '91 day'::INTERVAL +        AND (ingest_request.ingest_request_source = 'fatcat-changelog' +             OR ingest_request.ingest_request_source = 'fatcat-ingest') +) TO '/grande/snapshots/reingest_quarterly_cdx-error_current.rows.json'; + +COPY ( +    SELECT row_to_json(ingest_request.*) FROM ingest_request +    LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url +    WHERE ingest_request.ingest_type = 'pdf' +        AND ingest_file_result.ingest_type = 'pdf' +        AND ingest_file_result.hit = false +        AND ingest_file_result.status like 'cdx-error' +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL +        AND ingest_request.created > NOW() - '91 day'::INTERVAL +        AND (ingest_request.ingest_request_source != 'fatcat-changelog' +             AND ingest_request.ingest_request_source != 'fatcat-ingest') +) TO '/grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json'; + +COPY ( +    SELECT row_to_json(ingest_request.*) FROM ingest_request +    LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url +    WHERE ingest_request.ingest_type = 'pdf' +        AND ingest_file_result.ingest_type = 'pdf' +        AND ingest_file_result.hit = false +        AND ingest_file_result.status like 'wayback-error' +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL +        AND ingest_request.created > NOW() - '91 day'::INTERVAL +) TO '/grande/snapshots/reingest_quarterly_wayback-error_current.rows.json'; + +COPY ( +    SELECT row_to_json(ingest_request.*) FROM ingest_request +    LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url +    WHERE ingest_request.ingest_type = 'pdf' +        AND ingest_file_result.ingest_type = 'pdf' +        AND ingest_file_result.hit = false +        AND ingest_file_result.status like 'gateway-timeout' +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL +        AND ingest_request.created > NOW() - '91 day'::INTERVAL +        AND (ingest_request.ingest_request_source = 'fatcat-changelog' +             OR ingest_request.ingest_request_source = 'fatcat-ingest') +) TO '/grande/snapshots/reingest_quarterly_gateway-timeout.rows.json'; + +COPY ( +    SELECT row_to_json(ingest_request.*) FROM ingest_request +    LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url +    WHERE ingest_request.ingest_type = 'pdf' +        AND ingest_file_result.ingest_type = 'pdf' +        AND ingest_file_result.hit = false +        AND ingest_file_result.status like 'petabox-error' +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL +        AND ingest_request.created > NOW() - '91 day'::INTERVAL +        AND (ingest_request.ingest_request_source = 'fatcat-changelog' +             OR ingest_request.ingest_request_source = 'fatcat-ingest') +) TO '/grande/snapshots/reingest_quarterly_petabox-error_current.rows.json'; + diff --git a/sql/dump_reingest.sql b/sql/dump_reingest_weekly.sql index 5ec2883..28547a4 100644 --- a/sql/dump_reingest.sql +++ b/sql/dump_reingest_weekly.sql @@ -4,12 +4,13 @@ COPY (      LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url      WHERE ingest_request.ingest_type = 'pdf'          AND ingest_file_result.ingest_type = 'pdf' -        AND ingest_request.created < NOW() - '6 hour'::INTERVAL +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL          AND ingest_request.created > NOW() - '8 day'::INTERVAL          AND ingest_file_result.hit = false          AND ingest_file_result.status like 'spn2-%'          AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' -        AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' +        AND ingest_file_result.status != 'spn2-error:filesize-limit' +        AND ingest_file_result.status != 'spn2-wayback-error'  ) TO '/grande/snapshots/reingest_weekly_spn2-error_current.rows.json';  COPY ( @@ -19,7 +20,7 @@ COPY (          AND ingest_file_result.ingest_type = 'pdf'          AND ingest_file_result.hit = false          AND ingest_file_result.status like 'cdx-error' -        AND ingest_request.created < NOW() - '6 hour'::INTERVAL +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL          AND ingest_request.created > NOW() - '8 day'::INTERVAL          AND (ingest_request.ingest_request_source = 'fatcat-changelog'               OR ingest_request.ingest_request_source = 'fatcat-ingest') @@ -32,7 +33,7 @@ COPY (          AND ingest_file_result.ingest_type = 'pdf'          AND ingest_file_result.hit = false          AND ingest_file_result.status like 'cdx-error' -        AND ingest_request.created < NOW() - '6 hour'::INTERVAL +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL          AND ingest_request.created > NOW() - '8 day'::INTERVAL          AND (ingest_request.ingest_request_source != 'fatcat-changelog'               AND ingest_request.ingest_request_source != 'fatcat-ingest') @@ -45,7 +46,7 @@ COPY (          AND ingest_file_result.ingest_type = 'pdf'          AND ingest_file_result.hit = false          AND ingest_file_result.status like 'wayback-error' -        AND ingest_request.created < NOW() - '6 hour'::INTERVAL +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL          AND ingest_request.created > NOW() - '8 day'::INTERVAL  ) TO '/grande/snapshots/reingest_weekly_wayback-error_current.rows.json'; @@ -56,7 +57,7 @@ COPY (          AND ingest_file_result.ingest_type = 'pdf'          AND ingest_file_result.hit = false          AND ingest_file_result.status like 'gateway-timeout' -        AND ingest_request.created < NOW() - '6 hour'::INTERVAL +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL          AND ingest_request.created > NOW() - '8 day'::INTERVAL          AND (ingest_request.ingest_request_source = 'fatcat-changelog'               OR ingest_request.ingest_request_source = 'fatcat-ingest') @@ -69,7 +70,7 @@ COPY (          AND ingest_file_result.ingest_type = 'pdf'          AND ingest_file_result.hit = false          AND ingest_file_result.status like 'petabox-error' -        AND ingest_request.created < NOW() - '6 hour'::INTERVAL +        AND ingest_request.created < NOW() - '8 hour'::INTERVAL          AND ingest_request.created > NOW() - '8 day'::INTERVAL          AND (ingest_request.ingest_request_source = 'fatcat-changelog'               OR ingest_request.ingest_request_source = 'fatcat-ingest') diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql index 7b5e823..fb4b0af 100644 --- a/sql/dump_unextracted_pdf.sql +++ b/sql/dump_unextracted_pdf.sql @@ -9,12 +9,14 @@ COPY (    FROM grobid    LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex    --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex +  LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex    LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex    WHERE cdx.sha1hex IS NOT NULL      --AND fatcat_file.sha1hex IS NOT NULL +    AND ingest_file_result.terminal_sha1hex IS NOT NULL      AND pdf_meta.sha1hex IS NULL  ) -TO '/grande/snapshots/dump_unextracted_pdf.fatcat.2020-07-22.json' +TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'  WITH NULL '';  ROLLBACK; diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh new file mode 100755 index 0000000..44a22b3 --- /dev/null +++ b/sql/reingest_quarterly.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e              # fail on error +set -u              # fail if variable not set in substitution +set -o pipefail     # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql + +cd ../python +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_spn2-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_wayback-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_quarterly_gateway-timeout.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_petabox-error_current.json + +cat /grande/snapshots/reingest_quarterly_spn2-error_current.json /grande/snapshots/reingest_quarterly_cdx-error_current.json /grande/snapshots/reingest_quarterly_wayback-error_current.json /grande/snapshots/reingest_quarterly_petabox-error_current.json /grande/snapshots/reingest_quarterly_gateway-timeout.json | shuf | head -n100000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +cat /grande/snapshots/reingest_quarterly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh index f501b20..dfd4869 100755 --- a/sql/reingest_weekly.sh +++ b/sql/reingest_weekly.sh @@ -1,20 +1,20 @@ -#!/usr/bin/bash +#!/bin/bash  set -e              # fail on error  set -u              # fail if variable not set in substitution  set -o pipefail     # fail if part of a '|' command fails -sudo -u postgres psql < dump_reingest.sql +sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql  cd ../python -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_spn2-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_wayback-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_weekly_gateway-timeout.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots//grande/snapshots/reingest_weekly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_petabox-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_spn2-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_wayback-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_weekly_gateway-timeout.json +pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_petabox-error_current.json -cat /grande/snapshots/reingest_weekly_spn2-error_current.json /grande/snapshots/reingest_weekly_cdx-error_current.json /grande/snapshots/reingest_weekly_wayback-error_current.json /grande/snapshots/reingest_weekly_petabox-error_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 +cat /grande/snapshots/reingest_weekly_spn2-error_current.json /grande/snapshots/reingest_weekly_cdx-error_current.json /grande/snapshots/reingest_weekly_wayback-error_current.json /grande/snapshots/reingest_weekly_petabox-error_current.json /grande/snapshots/reingest_weekly_gateway-timeout.json | shuf | head -n40000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 -cat /grande/snapshots/reingest_weekly_gateway-timeout.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p $(RANDOM % 24) +cat /grande/snapshots/reingest_weekly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1  | 
