From aa1b437a629701db62a968a38b9d9764e7f912c2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Fri, 9 Apr 2021 13:07:21 -0700
Subject: sql: update paths to work with svc506 machine

---
 sql/dump_file_meta.sql               |  2 +-
 sql/dump_regrobid_pdf_petabox.sql    |  2 +-
 sql/dump_reingest_quarterly.sql      | 12 ++++++------
 sql/dump_reingest_weekly.sql         | 12 ++++++------
 sql/dump_unextracted_pdf.sql         |  2 +-
 sql/dump_unextracted_pdf_petabox.sql |  2 +-
 sql/dump_ungrobid_pdf.sql            |  2 +-
 sql/dump_ungrobid_pdf_petabox.sql    |  2 +-
 sql/dump_unmatched_glutton_pdf.sql   |  2 +-
 sql/ingest_again.md                  | 28 ++++++++++++++--------------
 sql/reingest_quarterly.sh            | 16 ++++++++--------
 sql/reingest_weekly.sh               | 16 ++++++++--------
 12 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'sql')

diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql
index 1028c13..a7d6c2b 100644
--- a/sql/dump_file_meta.sql
+++ b/sql/dump_file_meta.sql
@@ -6,7 +6,7 @@ COPY (
   FROM file_meta
   ORDER BY sha1hex ASC
 )
-TO '/grande/snapshots/file_meta_dump.tsv'
+TO '/srv/sandcrawler/tasks/file_meta_dump.tsv'
 WITH NULL '';
 
 ROLLBACK;
diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql
index 3ca8085..e7c48f3 100644
--- a/sql/dump_regrobid_pdf_petabox.sql
+++ b/sql/dump_regrobid_pdf_petabox.sql
@@ -9,7 +9,7 @@ COPY (
     SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox
     WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL)
 )
-TO '/grande/snapshots/dump_regrobid_pdf_petabox.2020-02-03.json'
+TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json'
 WITH NULL '';
 
 ROLLBACK;
diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql
index 066df9f..d73fe85 100644
--- a/sql/dump_reingest_quarterly.sql
+++ b/sql/dump_reingest_quarterly.sql
@@ -15,7 +15,7 @@ COPY (
         AND ingest_file_result.status != 'spn2-error:too-many-redirects'
         AND ingest_file_result.status != 'spn2-error:network-authentication-required'
         AND ingest_file_result.status != 'spn2-wayback-error'
-) TO '/grande/snapshots/reingest_quarterly_spn2-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -28,7 +28,7 @@ COPY (
         AND ingest_request.created > NOW() - '91 day'::INTERVAL
         AND (ingest_request.ingest_request_source = 'fatcat-changelog'
              OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_cdx-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -41,7 +41,7 @@ COPY (
         AND ingest_request.created > NOW() - '91 day'::INTERVAL
         AND (ingest_request.ingest_request_source != 'fatcat-changelog'
              AND ingest_request.ingest_request_source != 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -52,7 +52,7 @@ COPY (
         AND ingest_file_result.status like 'wayback-error'
         AND ingest_request.created < NOW() - '8 hour'::INTERVAL
         AND ingest_request.created > NOW() - '91 day'::INTERVAL
-) TO '/grande/snapshots/reingest_quarterly_wayback-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -65,7 +65,7 @@ COPY (
         AND ingest_request.created > NOW() - '91 day'::INTERVAL
         AND (ingest_request.ingest_request_source = 'fatcat-changelog'
              OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_gateway-timeout.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -78,5 +78,5 @@ COPY (
         AND ingest_request.created > NOW() - '91 day'::INTERVAL
         AND (ingest_request.ingest_request_source = 'fatcat-changelog'
              OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_quarterly_petabox-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.rows.json';
 
diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql
index bc10b22..b49f1f1 100644
--- a/sql/dump_reingest_weekly.sql
+++ b/sql/dump_reingest_weekly.sql
@@ -15,7 +15,7 @@ COPY (
         AND ingest_file_result.status != 'spn2-error:too-many-redirects'
         AND ingest_file_result.status != 'spn2-error:network-authentication-required'
         AND ingest_file_result.status != 'spn2-wayback-error'
-) TO '/grande/snapshots/reingest_weekly_spn2-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -28,7 +28,7 @@ COPY (
         AND ingest_request.created > NOW() - '8 day'::INTERVAL
         AND (ingest_request.ingest_request_source = 'fatcat-changelog'
              OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_cdx-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -41,7 +41,7 @@ COPY (
         AND ingest_request.created > NOW() - '8 day'::INTERVAL
         AND (ingest_request.ingest_request_source != 'fatcat-changelog'
              AND ingest_request.ingest_request_source != 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -52,7 +52,7 @@ COPY (
         AND ingest_file_result.status like 'wayback-error'
         AND ingest_request.created < NOW() - '8 hour'::INTERVAL
         AND ingest_request.created > NOW() - '8 day'::INTERVAL
-) TO '/grande/snapshots/reingest_weekly_wayback-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -65,7 +65,7 @@ COPY (
         AND ingest_request.created > NOW() - '8 day'::INTERVAL
         AND (ingest_request.ingest_request_source = 'fatcat-changelog'
              OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_gateway-timeout.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.rows.json';
 
 COPY (
     SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -78,5 +78,5 @@ COPY (
         AND ingest_request.created > NOW() - '8 day'::INTERVAL
         AND (ingest_request.ingest_request_source = 'fatcat-changelog'
              OR ingest_request.ingest_request_source = 'fatcat-ingest')
-) TO '/grande/snapshots/reingest_weekly_petabox-error_current.rows.json';
+) TO '/srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.rows.json';
 
diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql
index fb4b0af..a7fb920 100644
--- a/sql/dump_unextracted_pdf.sql
+++ b/sql/dump_unextracted_pdf.sql
@@ -16,7 +16,7 @@ COPY (
     AND ingest_file_result.terminal_sha1hex IS NOT NULL
     AND pdf_meta.sha1hex IS NULL
 )
-TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json'
 WITH NULL '';
 
 ROLLBACK;
diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql
index 7db34fb..bb9f162 100644
--- a/sql/dump_unextracted_pdf_petabox.sql
+++ b/sql/dump_unextracted_pdf_petabox.sql
@@ -12,7 +12,7 @@ COPY (
   WHERE petabox.sha1hex IS NOT NULL
     AND pdf_meta.sha1hex IS NULL
 )
-TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-07-22.json'
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json'
 WITH NULL '';
 
 ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql
index e65edd5..81caf18 100644
--- a/sql/dump_ungrobid_pdf.sql
+++ b/sql/dump_ungrobid_pdf.sql
@@ -12,7 +12,7 @@ COPY (
   -- uncomment/comment this to control whether only fatcat files are included
   --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex)
 )
-TO '/grande/snapshots/dump_ungrobided_pdf.fatcat.2020-08-04.json'
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json'
 WITH NULL '';
 
 ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql
index f758ec2..b7a1db2 100644
--- a/sql/dump_ungrobid_pdf_petabox.sql
+++ b/sql/dump_ungrobid_pdf_petabox.sql
@@ -11,7 +11,7 @@ COPY (
   -- uncomment/comment this to control whether only fatcat files are included
   AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex)
 )
-TO '/grande/snapshots/dump_ungrobided_pdf_petabox.2020-08-04.json'
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json'
 WITH NULL '';
 
 ROLLBACK;
diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql
index d089c7e..333ff7b 100644
--- a/sql/dump_unmatched_glutton_pdf.sql
+++ b/sql/dump_unmatched_glutton_pdf.sql
@@ -12,7 +12,7 @@ COPY (
   AND grobid.fatcat_release IS NOT NULL
   LIMIT 1000
 )
-TO '/grande/snapshots/dump_unmatched_glutton_pdf.2020-06-30.json';
+TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json';
 --TO STDOUT
 --WITH NULL '';
 
diff --git a/sql/ingest_again.md b/sql/ingest_again.md
index 3b4b990..b749557 100644
--- a/sql/ingest_again.md
+++ b/sql/ingest_again.md
@@ -12,7 +12,7 @@
             AND ingest_file_result.status like 'spn2-%'
             AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
             AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
-    ) TO '/grande/snapshots/reingest_spn2-error_current.rows.json';
+    ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json';
 
     COPY (
         SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -25,7 +25,7 @@
             AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
             AND (ingest_request.ingest_request_source = 'fatcat-changelog'
                  OR ingest_request.ingest_request_source = 'fatcat-ingest')
-    ) TO '/grande/snapshots/reingest_cdx-error_current.rows.json';
+    ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json';
 
     COPY (
         SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -38,7 +38,7 @@
             AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
             AND (ingest_request.ingest_request_source != 'fatcat-changelog'
                  AND ingest_request.ingest_request_source != 'fatcat-ingest')
-    ) TO '/grande/snapshots/reingest_cdx-error_bulk_current.rows.json';
+    ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json';
 
     COPY (
         SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -49,7 +49,7 @@
             AND ingest_file_result.status like 'wayback-error'
             AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
             AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
-    ) TO '/grande/snapshots/reingest_wayback-error_current.rows.json';
+    ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json';
 
     COPY (
         SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -62,7 +62,7 @@
             AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
             AND (ingest_request.ingest_request_source = 'fatcat-changelog'
                  OR ingest_request.ingest_request_source = 'fatcat-ingest')
-    ) TO '/grande/snapshots/reingest_gateway-timeout.rows.json';
+    ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json';
 
     COPY (
         SELECT row_to_json(ingest_request.*) FROM ingest_request
@@ -75,16 +75,16 @@
             AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
             AND (ingest_request.ingest_request_source = 'fatcat-changelog'
                  OR ingest_request.ingest_request_source = 'fatcat-ingest')
-    ) TO '/grande/snapshots/reingest_petabox-error_current.rows.json';
+    ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json';
 
 Transform:
 
-    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
-    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
-    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
-    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
-    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
-    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
 
 Push to kafka (shuffled):
 
@@ -122,10 +122,10 @@ Push to kafka (not shuffled):
             AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
             AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
             AND ingest_request.ingest_request_source = 'fatcat-ingest'
-    ) TO '/grande/snapshots/reingest_fatcat_current.rows.json';
+    ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json';
 
     # note: shuf
-    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
 
     cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
 
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
index 8b0889d..8437647 100755
--- a/sql/reingest_quarterly.sh
+++ b/sql/reingest_quarterly.sh
@@ -7,14 +7,14 @@ set -o pipefail     # fail if part of a '|' command fails
 sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql
 
 cd ../python
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_spn2-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_current.json
-#pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_wayback-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_quarterly_gateway-timeout.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_petabox-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.json
+#pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.json
 
-cat /grande/snapshots/reingest_quarterly_spn2-error_current.json /grande/snapshots/reingest_quarterly_cdx-error_current.json /grande/snapshots/reingest_quarterly_wayback-error_current.json /grande/snapshots/reingest_quarterly_petabox-error_current.json /grande/snapshots/reingest_quarterly_gateway-timeout.json | shuf | head -n250000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+cat /srv/sandcrawler/tasks/reingest_quarterly_spn2-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_wayback-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_petabox-error_current.json /srv/sandcrawler/tasks/reingest_quarterly_gateway-timeout.json | shuf | head -n250000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
 
-#cat /grande/snapshots/reingest_quarterly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+#cat /srv/sandcrawler/tasks/reingest_quarterly_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
 
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
index 8710f76..96b4249 100755
--- a/sql/reingest_weekly.sh
+++ b/sql/reingest_weekly.sh
@@ -7,14 +7,14 @@ set -o pipefail     # fail if part of a '|' command fails
 sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql
 
 cd ../python
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_spn2-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_current.json
-#pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_wayback-error_current.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_weekly_gateway-timeout.json
-pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_petabox-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.json
+#pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.json
+pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.rows.json | shuf > /srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.json
 
-cat /grande/snapshots/reingest_weekly_spn2-error_current.json /grande/snapshots/reingest_weekly_cdx-error_current.json /grande/snapshots/reingest_weekly_wayback-error_current.json /grande/snapshots/reingest_weekly_petabox-error_current.json /grande/snapshots/reingest_weekly_gateway-timeout.json | shuf | head -n60000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+cat /srv/sandcrawler/tasks/reingest_weekly_spn2-error_current.json /srv/sandcrawler/tasks/reingest_weekly_cdx-error_current.json /srv/sandcrawler/tasks/reingest_weekly_wayback-error_current.json /srv/sandcrawler/tasks/reingest_weekly_petabox-error_current.json /srv/sandcrawler/tasks/reingest_weekly_gateway-timeout.json | shuf | head -n60000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
 
-#cat /grande/snapshots/reingest_weekly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+#cat /srv/sandcrawler/tasks/reingest_weekly_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
 
-- 
cgit v1.2.3