-rw-r--r--   RUNBOOK.md                  | 4
-rw-r--r--   blobs/tasks.md              | 4
-rwxr-xr-x   python/grobid_tool.py       | 2
-rwxr-xr-x   python/pdftrio_tool.py      | 2
-rw-r--r--   python_hadoop/README.md     | 8
-rwxr-xr-x   sql/reingest_bulk.sh        | 2
-rwxr-xr-x   sql/reingest_quarterly.sh   | 2
-rwxr-xr-x   sql/reingest_spn.sh         | 2
-rwxr-xr-x   sql/reingest_weekly.sh      | 2
9 files changed, 14 insertions, 14 deletions
diff --git a/RUNBOOK.md b/RUNBOOK.md
--- a/RUNBOOK.md
+++ b/RUNBOOK.md
@@ -23,7 +23,7 @@ Copy/transfer to a Kafka node; load a sample and then the whole output:
 
 Older example; if this fails, need to re-run entire thing:
 
-    cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+    cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
 
 TODO: is it possible to use job log with millions of `--pipe` inputs? That
 would be more efficient in the event of failure.
@@ -35,7 +35,7 @@ Want to use GNU/Parallel in a mode that will do retries well:
 
     fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
         sort | \
        parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
-        './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+        './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
 
 After starting, check that messages are actually getting pushed to kafka
 (producer failures can be silent!). If anything goes wrong, run the exact same
diff --git a/blobs/tasks.md b/blobs/tasks.md
index 34dec8f..beb765f 100644
--- a/blobs/tasks.md
+++ b/blobs/tasks.md
@@ -19,7 +19,7 @@ didn't try to connect to postgresql.
 
 Commands:
 
-    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
     => Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed
     => run briefly, then kill
 
@@ -29,7 +29,7 @@ On kafka-broker worker:
 
 Then run 2x instances of worker (same command as above):
 
-    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
 
 At this point CPU-limited on this worker by the python processes (only 4
 cores on this machine).
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 029cbf1..3ffac98 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -5,7 +5,7 @@ might go to stdout, or might go to Kafka topic.
 
 Example of large parallel run, locally:
 
-    cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+    cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
 """
 
 import argparse
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 9d3010e..24b749d 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -5,7 +5,7 @@ text extraction.
 
 Example of large parallel run, locally:
 
-cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
 """
 
 import argparse
diff --git a/python_hadoop/README.md b/python_hadoop/README.md
index 198c949..7866480 100644
--- a/python_hadoop/README.md
+++ b/python_hadoop/README.md
@@ -68,7 +68,7 @@ running on a devbox and GROBID running on a dedicated machine:
 
     ./extraction_cdx_grobid.py \
         --hbase-table wbgrp-journal-extract-0-qa \
-        --hbase-host wbgrp-svc263.us.archive.org \
+        --hbase-host wbgrp-svc350.us.archive.org \
         --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
         tests/files/example.cdx
 
@@ -76,7 +76,7 @@ Running from the cluster (once a ./venv-current.tar.gz tarball exists):
 
     ./extraction_cdx_grobid.py \
         --hbase-table wbgrp-journal-extract-0-qa \
-        --hbase-host wbgrp-svc263.us.archive.org \
+        --hbase-host wbgrp-svc350.us.archive.org \
         --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
         -r hadoop \
         -c mrjob.conf \
@@ -90,13 +90,13 @@ running on a devbox:
 
     ./backfill_hbase_from_cdx.py \
         --hbase-table wbgrp-journal-extract-0-qa \
-        --hbase-host wbgrp-svc263.us.archive.org \
+        --hbase-host wbgrp-svc350.us.archive.org \
         tests/files/example.cdx
 
 Running from the cluster (once a ./venv-current.tar.gz tarball exists):
 
     ./backfill_hbase_from_cdx.py \
-        --hbase-host wbgrp-svc263.us.archive.org \
+        --hbase-host wbgrp-svc350.us.archive.org \
         --hbase-table wbgrp-journal-extract-0-qa \
         -r hadoop \
         -c mrjob.conf \
diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh
index d5d3e35..d39a171 100755
--- a/sql/reingest_bulk.sh
+++ b/sql/reingest_bulk.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_bulk_current.json \
     | shuf \
     | head -n1000000 \
     | jq . -c \
-    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
 
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
index 20fd82b..8a2996c 100755
--- a/sql/reingest_quarterly.sh
+++ b/sql/reingest_quarterly.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \
     | shuf \
     | head -n120000 \
     | jq . -c \
-    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
 
diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh
index 6fb1e4b..c693a64 100755
--- a/sql/reingest_spn.sh
+++ b/sql/reingest_spn.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_spn.json \
     | shuf \
     | head -n60000 \
     | jq . -c \
-    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
 
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
index b60bd0e..d2e2444 100755
--- a/sql/reingest_weekly.sh
+++ b/sql/reingest_weekly.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_weekly_current.json \
     | shuf \
     | head -n80000 \
     | jq . -c \
-    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
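Note: every hunk above is the same one-token substitution, pointing the Kafka bootstrap broker (and, in python_hadoop/README.md, the HBase host) at wbgrp-svc350.us.archive.org instead of wbgrp-svc263.us.archive.org. Before re-running any of the pipelines touched here, a quick read-only check against the new broker can catch the silent producer failures the RUNBOOK warns about. This is only a sketch, not part of the commit; it lists cluster metadata and reads back one recent record from a topic named in the scripts above:

    # list brokers, topics, and partitions visible via the new bootstrap broker
    kafkacat -L -b wbgrp-svc350.us.archive.org:9092

    # read back a single recent record from one of the ingest topics used above
    # (consume-only; nothing is produced)
    kafkacat -C -b wbgrp-svc350.us.archive.org:9092 \
        -t sandcrawler-prod.ingest-file-requests-daily \
        -o -1 -c 1 -e | jq .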