From 00ae74378413e87f230c88113ff8163a6f969d63 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 3 May 2022 17:12:48 -0700
Subject: switch default kafka-broker host from wbgrp-svc263 to wbgrp-svc350

---
 RUNBOOK.md                | 4 ++--
 blobs/tasks.md            | 4 ++--
 python/grobid_tool.py     | 2 +-
 python/pdftrio_tool.py    | 2 +-
 python_hadoop/README.md   | 8 ++++----
 sql/reingest_bulk.sh      | 2 +-
 sql/reingest_quarterly.sh | 2 +-
 sql/reingest_spn.sh       | 2 +-
 sql/reingest_weekly.sh    | 2 +-
 9 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/RUNBOOK.md b/RUNBOOK.md
index 33d4711..6c4165d 100644
--- a/RUNBOOK.md
+++ b/RUNBOOK.md
@@ -23,7 +23,7 @@ Copy/transfer to a Kafka node; load a sample and then the whole output:
 
 Older example; if this fails, need to re-run entire thing:
 
-    cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+    cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
 
 TODO: is it possible to use job log with millions of `--pipe` inputs? That
 would be more efficient in the event of failure.
@@ -35,7 +35,7 @@ Want to use GNU/Parallel in a mode that will do retries well:
     fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
         sort | \
         parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
-        './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+        './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
 
 After starting, check that messages are actually getting pushed to kafka
 (producer failures can be silent!). If anything goes wrong, run the exact same
diff --git a/blobs/tasks.md b/blobs/tasks.md
index 34dec8f..beb765f 100644
--- a/blobs/tasks.md
+++ b/blobs/tasks.md
@@ -19,7 +19,7 @@ didn't try to connect to postgresql.
 
 Commands:
 
-    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
 
 => Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed
 => run briefly, then kill
@@ -29,7 +29,7 @@ On kafka-broker worker:
 
 Then run 2x instances of worker (same command as above):
 
-    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
 
 At this point CPU-limited on this worker by the python processes (only 4
 cores on this machine).
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 029cbf1..3ffac98 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -5,7 +5,7 @@ might go to stdout, or might go to Kafka topic.
 
 Example of large parallel run, locally:
 
-    cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+    cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
 """
 
 import argparse
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 9d3010e..24b749d 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -5,7 +5,7 @@ text extraction.
 
 Example of large parallel run, locally:
 
-cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
 """
 
 import argparse
diff --git a/python_hadoop/README.md b/python_hadoop/README.md
index 198c949..7866480 100644
--- a/python_hadoop/README.md
+++ b/python_hadoop/README.md
@@ -68,7 +68,7 @@ running on a devbox and GROBID running on a dedicated machine:
 
     ./extraction_cdx_grobid.py \
         --hbase-table wbgrp-journal-extract-0-qa \
-        --hbase-host wbgrp-svc263.us.archive.org \
+        --hbase-host wbgrp-svc350.us.archive.org \
         --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
         tests/files/example.cdx
 
@@ -76,7 +76,7 @@ Running from the cluster (once a ./venv-current.tar.gz tarball exists):
 
     ./extraction_cdx_grobid.py \
         --hbase-table wbgrp-journal-extract-0-qa \
-        --hbase-host wbgrp-svc263.us.archive.org \
+        --hbase-host wbgrp-svc350.us.archive.org \
         --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
         -r hadoop \
         -c mrjob.conf \
@@ -90,13 +90,13 @@ running on a devbox:
 
     ./backfill_hbase_from_cdx.py \
         --hbase-table wbgrp-journal-extract-0-qa \
-        --hbase-host wbgrp-svc263.us.archive.org \
+        --hbase-host wbgrp-svc350.us.archive.org \
         tests/files/example.cdx
 
 Running from the cluster (once a ./venv-current.tar.gz tarball exists):
 
     ./backfill_hbase_from_cdx.py \
-        --hbase-host wbgrp-svc263.us.archive.org \
+        --hbase-host wbgrp-svc350.us.archive.org \
         --hbase-table wbgrp-journal-extract-0-qa \
         -r hadoop \
         -c mrjob.conf \
diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh
index d5d3e35..d39a171 100755
--- a/sql/reingest_bulk.sh
+++ b/sql/reingest_bulk.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_bulk_current.json \
     | shuf \
     | head -n1000000 \
     | jq . -c \
-    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
 
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
index 20fd82b..8a2996c 100755
--- a/sql/reingest_quarterly.sh
+++ b/sql/reingest_quarterly.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \
     | shuf \
     | head -n120000 \
     | jq . -c \
-    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
 
diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh
index 6fb1e4b..c693a64 100755
--- a/sql/reingest_spn.sh
+++ b/sql/reingest_spn.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_spn.json \
     | shuf \
     | head -n60000 \
     | jq . -c \
-    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
 
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
index b60bd0e..d2e2444 100755
--- a/sql/reingest_weekly.sh
+++ b/sql/reingest_weekly.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_weekly_current.json \
    | shuf \
     | head -n80000 \
     | jq . -c \
-    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
 
-- 
cgit v1.2.3
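To verify a broker switch like this, two quick checks can help. This is a
hedged sketch, not part of the commit; it assumes a checkout of this repo,
GNU grep, and kafkacat installed on a host with network access to the new
broker:

    # any remaining references to the old broker host in the tree?
    # (should print nothing once the rename is complete)
    grep -rn 'wbgrp-svc263' . --exclude-dir=.git

    # does the new broker answer on the Kafka port? -L lists cluster metadata
    kafkacat -L -b wbgrp-svc350.us.archive.org:9092 | head -n5

If the grep is silent and the metadata listing shows the expected topics
(e.g. sandcrawler-prod.ingest-file-requests-daily), producers and consumers
pointed at wbgrp-svc350 should behave the same as before the switch.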