aboutsummaryrefslogtreecommitdiffstats
path: root/sql
diff options
context:
space:
mode:
Diffstat (limited to 'sql')
-rw-r--r--sql/dump_file_meta.sql2
-rw-r--r--sql/dump_regrobid_pdf_petabox.sql2
-rw-r--r--sql/dump_reingest_quarterly.sql12
-rw-r--r--sql/dump_reingest_weekly.sql12
-rw-r--r--sql/dump_unextracted_pdf.sql2
-rw-r--r--sql/dump_unextracted_pdf_petabox.sql2
-rw-r--r--sql/dump_ungrobid_pdf.sql2
-rw-r--r--sql/dump_ungrobid_pdf_petabox.sql2
-rw-r--r--sql/dump_unmatched_glutton_pdf.sql2
-rw-r--r--sql/ingest_again.md28
-rwxr-xr-xsql/reingest_quarterly.sh16
-rwxr-xr-xsql/reingest_weekly.sh16
12 files changed, 49 insertions, 49 deletions
diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql
index 1028c13..a7d6c2b 100644
--- a/sql/dump_file_meta.sql
+++ b/sql/dump_file_meta.sql
@@ -6,7 +6,7 @@ COPY (
FROM file_meta
ORDER BY sha1hex ASC
)
-TO '/grande/snapshots/file_meta_dump.tsv'
+TO '/srv/sandcrawler/tasks/file_meta_dump.tsv'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql
index 3ca8085..e7c48f3 100644
--- a/sql/dump_regrobid_pdf_petabox.sql
+++ b/sql/dump_regrobid_pdf_petabox.sql
@@ -9,7 +9,7 @@ COPY (
SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox
WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL)
)
-TO '/grande/snapshots/dump_regrobid_pdf_petabox.2020-02-03.json'
+TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql
index 066df9f..d73fe85 100644
--- a/sql/dump_reingest_quarterly.sql
+++ b/sql/dump_reingest_quarterly.sql
@@ -15,7 +15,7 @@ COPY (
AND ingest_file_result.status != 'spn2-error:too-many-redirects'
AND ingest_file_result.status != 'spn2-error:network-authentication-required'
AND ingest_file_result.status != 'spn2-wayback-error'
-) TO '/grande/snapshots/reingest_quarterly_spn2-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -28,7 +28,7 @@ COPY (
AND ingest_request.created > NOW() - '91 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_cdx-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -41,7 +41,7 @@ COPY (
AND ingest_request.created > NOW() - '91 day'::INTERVAL
AND (ingest_request.ingest_request_source != 'fatcat-changelog'
AND ingest_request.ingest_request_source != 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -52,7 +52,7 @@ COPY (
AND ingest_file_result.status like 'wayback-error'
AND ingest_request.created < NOW() - '8 hour'::INTERVAL
AND ingest_request.created > NOW() - '91 day'::INTERVAL
-) TO '/grande/snapshots/reingest_quarterly_wayback-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -65,7 +65,7 @@ COPY (
AND ingest_request.created > NOW() - '91 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_gateway-timeout.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -78,5 +78,5 @@ COPY (
AND ingest_request.created > NOW() - '91 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_petabox-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.rows.json';
diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql
index bc10b22..b49f1f1 100644
--- a/sql/dump_reingest_weekly.sql
+++ b/sql/dump_reingest_weekly.sql
@@ -15,7 +15,7 @@ COPY (
AND ingest_file_result.status != 'spn2-error:too-many-redirects'
AND ingest_file_result.status != 'spn2-error:network-authentication-required'
AND ingest_file_result.status != 'spn2-wayback-error'
-) TO '/grande/snapshots/reingest_weekly_spn2-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -28,7 +28,7 @@ COPY (
AND ingest_request.created > NOW() - '8 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_cdx-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -41,7 +41,7 @@ COPY (
AND ingest_request.created > NOW() - '8 day'::INTERVAL
AND (ingest_request.ingest_request_source != 'fatcat-changelog'
AND ingest_request.ingest_request_source != 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -52,7 +52,7 @@ COPY (
AND ingest_file_result.status like 'wayback-error'
AND ingest_request.created < NOW() - '8 hour'::INTERVAL
AND ingest_request.created > NOW() - '8 day'::INTERVAL
-) TO '/grande/snapshots/reingest_weekly_wayback-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -65,7 +65,7 @@ COPY (
AND ingest_request.created > NOW() - '8 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_gateway-timeout.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -78,5 +78,5 @@ COPY (
AND ingest_request.created > NOW() - '8 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_petabox-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.rows.json';
diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql
index fb4b0af..a7fb920 100644
--- a/sql/dump_unextracted_pdf.sql
+++ b/sql/dump_unextracted_pdf.sql
@@ -16,7 +16,7 @@ COPY (
AND ingest_file_result.terminal_sha1hex IS NOT NULL
AND pdf_meta.sha1hex IS NULL
)
-TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql
index 7db34fb..bb9f162 100644
--- a/sql/dump_unextracted_pdf_petabox.sql
+++ b/sql/dump_unextracted_pdf_petabox.sql
@@ -12,7 +12,7 @@ COPY (
WHERE petabox.sha1hex IS NOT NULL
AND pdf_meta.sha1hex IS NULL
)
-TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-07-22.json'
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql
index e65edd5..81caf18 100644
--- a/sql/dump_ungrobid_pdf.sql
+++ b/sql/dump_ungrobid_pdf.sql
@@ -12,7 +12,7 @@ COPY (
-- uncomment/comment this to control whether only fatcat files are included
--AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex)
)
-TO '/grande/snapshots/dump_ungrobided_pdf.fatcat.2020-08-04.json'
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql
index f758ec2..b7a1db2 100644
--- a/sql/dump_ungrobid_pdf_petabox.sql
+++ b/sql/dump_ungrobid_pdf_petabox.sql
@@ -11,7 +11,7 @@ COPY (
-- uncomment/comment this to control whether only fatcat files are included
AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex)
)
-TO '/grande/snapshots/dump_ungrobided_pdf_petabox.2020-08-04.json'
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json'
WITH NULL '';
ROLLBACK;
diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql
index d089c7e..333ff7b 100644
--- a/sql/dump_unmatched_glutton_pdf.sql
+++ b/sql/dump_unmatched_glutton_pdf.sql
@@ -12,7 +12,7 @@ COPY (
AND grobid.fatcat_release IS NOT NULL
LIMIT 1000
)
-TO '/grande/snapshots/dump_unmatched_glutton_pdf.2020-06-30.json';
+TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json';
--TO STDOUT
--WITH NULL '';
diff --git a/sql/ingest_again.md b/sql/ingest_again.md
index 3b4b990..b749557 100644
--- a/sql/ingest_again.md
+++ b/sql/ingest_again.md
@@ -12,7 +12,7 @@
AND ingest_file_result.status like 'spn2-%'
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
- ) TO '/grande/snapshots/reingest_spn2-error_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -25,7 +25,7 @@
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
- ) TO '/grande/snapshots/reingest_cdx-error_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -38,7 +38,7 @@
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source != 'fatcat-changelog'
AND ingest_request.ingest_request_source != 'fatcat-ingest')
- ) TO '/grande/snapshots/reingest_cdx-error_bulk_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -49,7 +49,7 @@
AND ingest_file_result.status like 'wayback-error'
AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
- ) TO '/grande/snapshots/reingest_wayback-error_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -62,7 +62,7 @@
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
- ) TO '/grande/snapshots/reingest_gateway-timeout.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json';
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -75,16 +75,16 @@
AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
OR ingest_request.ingest_request_source = 'fatcat-ingest')
- ) TO '/grande/snapshots/reingest_petabox-error_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json';
Transform:
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
Push to kafka (shuffled):
@@ -122,10 +122,10 @@ Push to kafka (not shuffled):
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
AND ingest_request.ingest_request_source = 'fatcat-ingest'
- ) TO '/grande/snapshots/reingest_fatcat_current.rows.json';
+ ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json';
# note: shuf
- ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
index 8b0889d..8437647 100755
--- a/sql/reingest_quarterly.sh
+++ b/sql/reingest_quarterly.sh
@@ -7,14 +7,14 @@ set -o pipefail # fail if part of a '|' command fails
sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql
cd ../python
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_spn2-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_current.json
-#pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_wayback-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_quarterly_gateway-timeout.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_petabox-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.json
+#pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.json
-cat /grande/snapshots/reingest_quarterly_spn2-error_current.json /grande/snapshots/reingest_quarterly_cdx-error_current.json /grande/snapshots/reingest_quarterly_wayback-error_current.json /grande/snapshots/reingest_quarterly_petabox-error_current.json /grande/snapshots/reingest_quarterly_gateway-timeout.json | shuf | head -n250000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+cat /srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.json | shuf | head -n250000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
-#cat /grande/snapshots/reingest_quarterly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+#cat /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
index 8710f76..96b4249 100755
--- a/sql/reingest_weekly.sh
+++ b/sql/reingest_weekly.sh
@@ -7,14 +7,14 @@ set -o pipefail # fail if part of a '|' command fails
sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql
cd ../python
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_spn2-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_current.json
-#pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_wayback-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_weekly_gateway-timeout.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_petabox-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.json
+#pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.json
-cat /grande/snapshots/reingest_weekly_spn2-error_current.json /grande/snapshots/reingest_weekly_cdx-error_current.json /grande/snapshots/reingest_weekly_wayback-error_current.json /grande/snapshots/reingest_weekly_petabox-error_current.json /grande/snapshots/reingest_weekly_gateway-timeout.json | shuf | head -n60000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+cat /srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.json /srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.json /srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.json /srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.json /srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.json | shuf | head -n60000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
-#cat /grande/snapshots/reingest_weekly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+#cat /srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1