diff options
-rw-r--r-- | sql/dump_file_meta.sql | 2 | ||||
-rw-r--r-- | sql/dump_regrobid_pdf_petabox.sql | 2 | ||||
-rw-r--r-- | sql/dump_reingest_quarterly.sql | 12 | ||||
-rw-r--r-- | sql/dump_reingest_weekly.sql | 12 | ||||
-rw-r--r-- | sql/dump_unextracted_pdf.sql | 2 | ||||
-rw-r--r-- | sql/dump_unextracted_pdf_petabox.sql | 2 | ||||
-rw-r--r-- | sql/dump_ungrobid_pdf.sql | 2 | ||||
-rw-r--r-- | sql/dump_ungrobid_pdf_petabox.sql | 2 | ||||
-rw-r--r-- | sql/dump_unmatched_glutton_pdf.sql | 2 | ||||
-rw-r--r-- | sql/ingest_again.md | 28 | ||||
-rwxr-xr-x | sql/reingest_quarterly.sh | 16 | ||||
-rwxr-xr-x | sql/reingest_weekly.sh | 16 |
12 files changed, 49 insertions, 49 deletions
diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql index 1028c13..a7d6c2b 100644 --- a/sql/dump_file_meta.sql +++ b/sql/dump_file_meta.sql @@ -6,7 +6,7 @@ COPY ( FROM file_meta ORDER BY sha1hex ASC ) -TO '/grande/snapshots/file_meta_dump.tsv' +TO '/srv/sandcrawler/tasks/file_meta_dump.tsv' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql index 3ca8085..e7c48f3 100644 --- a/sql/dump_regrobid_pdf_petabox.sql +++ b/sql/dump_regrobid_pdf_petabox.sql @@ -9,7 +9,7 @@ COPY ( SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) ) -TO '/grande/snapshots/dump_regrobid_pdf_petabox.2020-02-03.json' +TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql index 066df9f..d73fe85 100644 --- a/sql/dump_reingest_quarterly.sql +++ b/sql/dump_reingest_quarterly.sql @@ -15,7 +15,7 @@ COPY ( AND ingest_file_result.status != 'spn2-error:too-many-redirects' AND ingest_file_result.status != 'spn2-error:network-authentication-required' AND ingest_file_result.status != 'spn2-wayback-error' -) TO '/grande/snapshots/reingest_quarterly_spn2-error_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -28,7 +28,7 @@ COPY ( AND ingest_request.created > NOW() - '91 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_quarterly_cdx-error_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -41,7 +41,7 @@ COPY ( AND ingest_request.created > NOW() - '91 day'::INTERVAL AND (ingest_request.ingest_request_source != 'fatcat-changelog' AND ingest_request.ingest_request_source != 'fatcat-ingest') -) TO '/grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -52,7 +52,7 @@ COPY ( AND ingest_file_result.status like 'wayback-error' AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '91 day'::INTERVAL -) TO '/grande/snapshots/reingest_quarterly_wayback-error_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -65,7 +65,7 @@ COPY ( AND ingest_request.created > NOW() - '91 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_quarterly_gateway-timeout.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -78,5 +78,5 @@ COPY ( AND ingest_request.created > NOW() - '91 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_quarterly_petabox-error_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.rows.json'; diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql index bc10b22..b49f1f1 100644 --- a/sql/dump_reingest_weekly.sql +++ b/sql/dump_reingest_weekly.sql @@ -15,7 +15,7 @@ COPY ( AND ingest_file_result.status != 'spn2-error:too-many-redirects' AND ingest_file_result.status != 'spn2-error:network-authentication-required' AND ingest_file_result.status != 'spn2-wayback-error' -) TO '/grande/snapshots/reingest_weekly_spn2-error_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -28,7 +28,7 @@ COPY ( AND ingest_request.created > NOW() - '8 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_weekly_cdx-error_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -41,7 +41,7 @@ COPY ( AND ingest_request.created > NOW() - '8 day'::INTERVAL AND (ingest_request.ingest_request_source != 'fatcat-changelog' AND ingest_request.ingest_request_source != 'fatcat-ingest') -) TO '/grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -52,7 +52,7 @@ COPY ( AND ingest_file_result.status like 'wayback-error' AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '8 day'::INTERVAL -) TO '/grande/snapshots/reingest_weekly_wayback-error_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -65,7 +65,7 @@ COPY ( AND ingest_request.created > NOW() - '8 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_weekly_gateway-timeout.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -78,5 +78,5 @@ COPY ( AND ingest_request.created > NOW() - '8 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_weekly_petabox-error_current.rows.json'; +) TO '/srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.rows.json'; diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql index fb4b0af..a7fb920 100644 --- a/sql/dump_unextracted_pdf.sql +++ b/sql/dump_unextracted_pdf.sql @@ -16,7 +16,7 @@ COPY ( AND ingest_file_result.terminal_sha1hex IS NOT NULL AND pdf_meta.sha1hex IS NULL ) -TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json' +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql index 7db34fb..bb9f162 100644 --- a/sql/dump_unextracted_pdf_petabox.sql +++ b/sql/dump_unextracted_pdf_petabox.sql @@ -12,7 +12,7 @@ COPY ( WHERE petabox.sha1hex IS NOT NULL AND pdf_meta.sha1hex IS NULL ) -TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-07-22.json' +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql index e65edd5..81caf18 100644 --- a/sql/dump_ungrobid_pdf.sql +++ b/sql/dump_ungrobid_pdf.sql @@ -12,7 +12,7 @@ COPY ( -- uncomment/comment this to control whether only fatcat files are included --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex) ) -TO '/grande/snapshots/dump_ungrobided_pdf.fatcat.2020-08-04.json' +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql index f758ec2..b7a1db2 100644 --- a/sql/dump_ungrobid_pdf_petabox.sql +++ b/sql/dump_ungrobid_pdf_petabox.sql @@ -11,7 +11,7 @@ COPY ( -- uncomment/comment this to control whether only fatcat files are included AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex) ) -TO '/grande/snapshots/dump_ungrobided_pdf_petabox.2020-08-04.json' +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql index d089c7e..333ff7b 100644 --- a/sql/dump_unmatched_glutton_pdf.sql +++ b/sql/dump_unmatched_glutton_pdf.sql @@ -12,7 +12,7 @@ COPY ( AND grobid.fatcat_release IS NOT NULL LIMIT 1000 ) -TO '/grande/snapshots/dump_unmatched_glutton_pdf.2020-06-30.json'; +TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json'; --TO STDOUT --WITH NULL ''; diff --git a/sql/ingest_again.md b/sql/ingest_again.md index 3b4b990..b749557 100644 --- a/sql/ingest_again.md +++ b/sql/ingest_again.md @@ -12,7 +12,7 @@ AND ingest_file_result.status like 'spn2-%' AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' - ) TO '/grande/snapshots/reingest_spn2-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -25,7 +25,7 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_cdx-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -38,7 +38,7 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source != 'fatcat-changelog' AND ingest_request.ingest_request_source != 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_cdx-error_bulk_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -49,7 +49,7 @@ AND ingest_file_result.status like 'wayback-error' AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL - ) TO '/grande/snapshots/reingest_wayback-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -62,7 +62,7 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_gateway-timeout.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -75,16 +75,16 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_petabox-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json'; Transform: - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json Push to kafka (shuffled): @@ -122,10 +122,10 @@ Push to kafka (not shuffled): AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' AND ingest_request.ingest_request_source = 'fatcat-ingest' - ) TO '/grande/snapshots/reingest_fatcat_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json'; # note: shuf - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh index 8b0889d..8437647 100755 --- a/sql/reingest_quarterly.sh +++ b/sql/reingest_quarterly.sh @@ -7,14 +7,14 @@ set -o pipefail # fail if part of a '|' command fails sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql cd ../python -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_spn2-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_current.json -#pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_wayback-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_quarterly_gateway-timeout.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_petabox-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.json +#pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.json -cat /grande/snapshots/reingest_quarterly_spn2-error_current.json /grande/snapshots/reingest_quarterly_cdx-error_current.json /grande/snapshots/reingest_quarterly_wayback-error_current.json /grande/snapshots/reingest_quarterly_petabox-error_current.json /grande/snapshots/reingest_quarterly_gateway-timeout.json | shuf | head -n250000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 +cat /srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.json | shuf | head -n250000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 -#cat /grande/snapshots/reingest_quarterly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +#cat /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh index 8710f76..96b4249 100755 --- a/sql/reingest_weekly.sh +++ b/sql/reingest_weekly.sh @@ -7,14 +7,14 @@ set -o pipefail # fail if part of a '|' command fails sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql cd ../python -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_spn2-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_current.json -#pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_wayback-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_weekly_gateway-timeout.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_petabox-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.json +#pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.json +pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.json -cat /grande/snapshots/reingest_weekly_spn2-error_current.json /grande/snapshots/reingest_weekly_cdx-error_current.json /grande/snapshots/reingest_weekly_wayback-error_current.json /grande/snapshots/reingest_weekly_petabox-error_current.json /grande/snapshots/reingest_weekly_gateway-timeout.json | shuf | head -n60000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 +cat /srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.json /srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.json /srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.json /srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.json /srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.json | shuf | head -n60000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 -#cat /grande/snapshots/reingest_weekly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +#cat /srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 |