author     Bryan Newbold <bnewbold@archive.org>  2022-05-03 17:12:48 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2022-05-03 17:12:48 -0700
commit     00ae74378413e87f230c88113ff8163a6f969d63 (patch)
tree       16cdcbde7a002704e80f494b7fd13fc5c19dd695
parent     ef0421567dd67a248d0f92f32ad4e14ae0776920 (diff)
switch default kafka-broker host from wbgrp-svc263 to wbgrp-svc350
-rw-r--r--  RUNBOOK.md                 4
-rw-r--r--  blobs/tasks.md             4
-rwxr-xr-x  python/grobid_tool.py      2
-rwxr-xr-x  python/pdftrio_tool.py     2
-rw-r--r--  python_hadoop/README.md    8
-rwxr-xr-x  sql/reingest_bulk.sh       2
-rwxr-xr-x  sql/reingest_quarterly.sh  2
-rwxr-xr-x  sql/reingest_spn.sh        2
-rwxr-xr-x  sql/reingest_weekly.sh     2
9 files changed, 14 insertions, 14 deletions
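
The change is mechanical: every occurrence of the old broker wbgrp-svc263 in docs, tool docstrings, and cron scripts now points at wbgrp-svc350. A quick sanity check that the new broker is reachable, assuming it listens on the standard port 9092, is to ask it for cluster metadata with kafkacat (already a dependency of the reingest scripts below):

    kafkacat -L -b wbgrp-svc350.us.archive.org:9092
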
diff --git a/RUNBOOK.md b/RUNBOOK.md
index 33d4711..6c4165d 100644
--- a/RUNBOOK.md
+++ b/RUNBOOK.md
@@ -23,7 +23,7 @@ Copy/transfer to a Kafka node; load a sample and then the whole output:
Older example; if this fails, need to re-run entire thing:
- cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
TODO: is it possible to use a job log (`--joblog`) with millions of `--pipe` inputs? That
would be more efficient in the event of failure.
@@ -35,7 +35,7 @@ Want to use GNU/Parallel in a mode that will do retries well:
fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
sort | \
parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
- './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+ './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
After starting, check that messages are actually getting pushed to kafka
(producer failures can be silent!). If anything goes wrong, run the exact same
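
One way to make that check concrete is to tail the output topic with kafkacat and confirm fresh messages are arriving; this is a sketch using the GROBID output topic named in blobs/tasks.md, so substitute whichever topic the run actually produces to:

    kafkacat -C -b wbgrp-svc350.us.archive.org:9092 -t sandcrawler-prod.grobid-output-pg -o -5 -e
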
diff --git a/blobs/tasks.md b/blobs/tasks.md
index 34dec8f..beb765f 100644
--- a/blobs/tasks.md
+++ b/blobs/tasks.md
@@ -19,7 +19,7 @@ didn't try to connect to postgresql.
Commands:
- ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
=> Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed
=> run briefly, then kill
@@ -29,7 +29,7 @@ On kafka-broker worker:
Then run two instances of the worker (same command as above):
- ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
At this point, throughput is CPU-limited on this worker by the python processes (only 4 cores
on this machine).
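
To verify the two instances are actually draining the topic, one option (assuming the stock Kafka CLI tools are available on the broker host) is to describe the consumer group named above and watch the LAG column shrink:

    kafka-consumer-groups.sh --bootstrap-server wbgrp-svc350.us.archive.org:9092 --describe --group persist-grobid-seaweed
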
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 029cbf1..3ffac98 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -5,7 +5,7 @@ might go to stdout, or might go to Kafka topic.
Example of large parallel run, locally:
- cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+ cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
"""
import argparse
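
Since these runs assume a local GROBID on port 8070, a cheap pre-flight check before launching dozens of parallel workers is GROBID's standard liveness endpoint (this assumes a stock GROBID service; adjust host and port to match --grobid-host):

    curl -s http://localhost:8070/api/isalive
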
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 9d3010e..24b749d 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -5,7 +5,7 @@ text extraction.
Example of large parallel run, locally:
-cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
"""
import argparse
diff --git a/python_hadoop/README.md b/python_hadoop/README.md
index 198c949..7866480 100644
--- a/python_hadoop/README.md
+++ b/python_hadoop/README.md
@@ -68,7 +68,7 @@ running on a devbox and GROBID running on a dedicated machine:
./extraction_cdx_grobid.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
tests/files/example.cdx
@@ -76,7 +76,7 @@ Running from the cluster (once a ./venv-current.tar.gz tarball exists):
./extraction_cdx_grobid.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
-r hadoop \
-c mrjob.conf \
@@ -90,13 +90,13 @@ running on a devbox:
./backfill_hbase_from_cdx.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
tests/files/example.cdx
Running from the cluster (once a ./venv-current.tar.gz tarball exists):
./backfill_hbase_from_cdx.py \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--hbase-table wbgrp-journal-extract-0-qa \
-r hadoop \
-c mrjob.conf \
diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh
index d5d3e35..d39a171 100755
--- a/sql/reingest_bulk.sh
+++ b/sql/reingest_bulk.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_bulk_current.json \
| shuf \
| head -n1000000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
index 20fd82b..8a2996c 100755
--- a/sql/reingest_quarterly.sh
+++ b/sql/reingest_quarterly.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \
| shuf \
| head -n120000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh
index 6fb1e4b..c693a64 100755
--- a/sql/reingest_spn.sh
+++ b/sql/reingest_spn.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_spn.json \
| shuf \
| head -n60000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
index b60bd0e..d2e2444 100755
--- a/sql/reingest_weekly.sh
+++ b/sql/reingest_weekly.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_weekly_current.json \
| shuf \
| head -n80000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
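
All four reingest scripts follow the same pattern: sample N requests (`shuf | head`), re-serialize to one JSON object per line (`jq . -c`), and produce to the appropriate topic with kafkacat (`-P` produce mode, `-p -1` random partition). Before a large run, a dry preview of a few lines, with the kafkacat stage dropped, can confirm the input parses as JSON:

    cat /srv/sandcrawler/tasks/reingest_weekly_current.json | shuf | head -n5 | jq . -c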