Diffstat (limited to 'kafka'):

 kafka/debugging_issues.txt   |   9
 kafka/howto_rebalance.md     |  43
 kafka/monitoring_commands.md |   4
 kafka/topics.md              | 107
 4 files changed, 158 insertions(+), 5 deletions(-)

diff --git a/kafka/debugging_issues.txt b/kafka/debugging_issues.txt
index 1af490e..007c786 100644
--- a/kafka/debugging_issues.txt
+++ b/kafka/debugging_issues.txt
@@ -1,4 +1,13 @@
+## 2020-11-12
+
+To reset a consumer group to the offsets from a specific date (or datetime),
+use:
+
+    ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-grobid-s3 --reset-offsets --all-topics --to-datetime 2020-11-09T00:00:00.000
+
+Add `--execute` to actually commit the change.
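+
+A quick sanity check before committing the reset (a sketch; `--describe` and
+`--dry-run` are standard kafka-consumer-groups.sh flags, and a reset without
+`--execute` is already a preview):
+
+    # show the group's current per-partition offsets and lag
+    ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-grobid-s3 --describe
+
+    # preview the target offsets without committing them
+    ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-grobid-s3 --reset-offsets --all-topics --to-datetime 2020-11-09T00:00:00.000 --dry-run
+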
 ## 2018-12-02
 
 Had been having some troubles with consumer group partition assignments with
diff --git a/kafka/howto_rebalance.md b/kafka/howto_rebalance.md
new file mode 100644
index 0000000..093740a
--- /dev/null
+++ b/kafka/howto_rebalance.md
@@ -0,0 +1,43 @@
+
+## Rebalance Storage Between Brokers (kafka-manager web)
+
+For each topic you want to rebalance (eg, the large or high-throughput ones),
+go to the topic page and use the blue "reassign partitions" button (or
+potentially "generate" or "manual").
+
+Monitor progress with the "Reassign Partitions" link at the top of the page.
+
+Finally, run a preferred replica election after partition movement is complete.
+
+## Rebalance Storage Between Brokers (CLI)
+
+This is useful, for example, after adding or removing brokers from the
+cluster.
+
+Create a list of topics to move, and put it in `/tmp/topics_to_move.json`:
+
+    {
+      "version": 1,
+      "topics": [
+        {"topic": "sandcrawler-shadow.grobid-output"},
+        {"topic": "fatcat-prod.api-crossref"}
+      ]
+    }
+
+On a kafka broker, go to `/srv/kafka-broker/kafka-*/bin`, generate a plan, then
+inspect the output:
+
+    ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --broker-list "280,281,284,285,263" --topics-to-move-json-file /tmp/topics_to_move.json --generate > /tmp/reassignment-plan.json
+    cat /tmp/reassignment-plan.json | rg '^\{' | head -n1 | jq . > /tmp/old-plan.json
+    cat /tmp/reassignment-plan.json | rg '^\{' | tail -n1 | jq . > /tmp/new-plan.json
+    cat /tmp/reassignment-plan.json | rg '^\{' | jq .
+
+If that looks good, start the rebalance:
+
+    ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --execute
+
+Then monitor progress:
+
+    ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --verify
+
+Finally, run a preferred replica election after partition movement is complete.
+Currently this is done through the kafka-manager web interface (linked above);
+a CLI sketch follows.
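+
+A minimal CLI sketch, assuming the broker version here still ships
+`kafka-preferred-replica-election.sh` (newer Kafka releases replace it with
+`kafka-leader-election.sh`):
+
+    # trigger a preferred replica election for all topics/partitions
+    ./kafka-preferred-replica-election.sh --zookeeper localhost:2181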
diff --git a/kafka/monitoring_commands.md b/kafka/monitoring_commands.md
new file mode 100644
index 0000000..c0c330f
--- /dev/null
+++ b/kafka/monitoring_commands.md
@@ -0,0 +1,4 @@
+
+    kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.status, .base_url]' -c
+
+    kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.request.ingest_request_source, .status, .request.base_url, .terminal.terminal_url]' -c
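+
+For the consumer-side view (offsets and lag rather than message contents), a
+sketch; the group name `persist-ingest-file-results` is hypothetical:
+
+    # list consumer groups known to the broker
+    ./kafka-consumer-groups.sh --bootstrap-server wbgrp-svc284.us.archive.org:9092 --list
+
+    # per-partition offsets and lag for one group
+    ./kafka-consumer-groups.sh --bootstrap-server wbgrp-svc284.us.archive.org:9092 --describe --group persist-ingest-file-results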
diff --git a/kafka/topics.md b/kafka/topics.md
index 36337da..a699e16 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -25,6 +25,63 @@ retention (on both a size and time basis).
      => fewer partitions with batch mode, but still a bunch (24?)
      => key is sha1hex of PDF. enable time compaction (6 months?)
 
+    sandcrawler-ENV.ingest-file-requests-daily
+     => was ingest-file-requests previously, but renamed/rebalanced
+     => ingest requests from multiple sources; mostly continuous or pseudo-interactive
+     => schema is JSON; see ingest proposal for fields. small objects.
+     => fewer partitions with batch mode, but still a bunch (24)
+     => can't think of a good key, so none. enable time compaction (3-6 months?)
+
+    sandcrawler-ENV.ingest-file-requests-bulk
+     => ingest requests from bulk crawl sources; background processing
+     => same as ingest-file-requests
+
+    sandcrawler-ENV.ingest-file-requests-priority
+     => ingest requests needing priority processing
+     => same as ingest-file-requests
+
+    sandcrawler-ENV.ingest-file-results
+     => results of ingest requests from multiple sources
+     => schema is JSON; see ingest proposal for fields. small objects.
+     => 6 partitions
+     => can't think of a good key, so none; no compaction
+
+    sandcrawler-ENV.pdftrio-output
+     => output of each pdftrio ML classification
+     => schema is JSON; see pdftrio proposal for fields. small objects.
+     => 6 partitions
+     => key is sha1hex of PDF; enable key compaction
+
+    sandcrawler-ENV.unextracted
+     => PDF files in IA needing extraction (thumbnails and text)
+     => schema is sandcrawler-db style JSON. Can be either `cdx` or `petabox` object
+     => fewer partitions with batch mode, but still a bunch (12? 24?)
+     => key is sha1hex of PDF. enable time compaction (6 months?)
+
+    sandcrawler-ENV.pdf-text
+     => fulltext (raw text) and PDF metadata for pdfs
+     => schema is JSON; see pdf_meta proposal for fields. large objects.
+     => 12 partitions
+     => key is sha1hex of PDF; enable key compaction; gzip compression
+
+    sandcrawler-ENV.xml-doc
+     => fulltext XML; mostly JATS XML
+     => schema is JSON, with 'jats_xml' field containing the XML as a string
+     => 6 partitions
+     => key is sha1hex of XML document; enable key compaction; gzip compression
+
+    sandcrawler-ENV.html-teixml
+     => extracted fulltext from HTML; mostly TEI-XML
+     => schema is JSON, with 'tei_xml' field containing the XML as a string
+     => 6 partitions
+     => key is sha1hex of source HTML document; enable key compaction; gzip compression
+
+    sandcrawler-ENV.pdf-thumbnail-SIZE-TYPE
+     => thumbnail images (eg, png, jpg) from PDFs
+     => raw bytes in message (no JSON or other wrapping). messages average 10 KByte
+     => 12 partitions; expect a TByte or so total
+     => key is sha1hex of PDF; enable key compaction; no compression
+
     fatcat-ENV.api-crossref
     fatcat-ENV.api-datacite
      => all new and updated DOIs (regardless of type)
@@ -33,8 +90,15 @@ retention (on both a size and time basis).
      => ~1TB capacity; 8x crossref partitions, 4x datacite
      => key compaction possible
 
+    fatcat-ENV.ftp-pubmed
+     => new citations from FTP server, from: ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
+     => raw XML, one record per message (PubmedArticle, up to 25k records/day and 650MB/day)
+     => key: PMID
+     => key compaction possible
+
     fatcat-ENV.api-crossref-state
     fatcat-ENV.api-datacite-state
+    fatcat-ENV.ftp-pubmed-state
     fatcat-ENV.oaipmh-pubmed-state
     fatcat-ENV.oaipmh-arxiv-state
     fatcat-ENV.oaipmh-doaj-journals-state (DISABLED)
@@ -54,15 +118,28 @@ retention (on both a size and time basis).
      => v03 is newer v0.3.0 API schema (backwards incompatible)
      => key: fcid
      => 8x partitions
-    fatcat-ENV.work-updates
-     => key: fcid
-     => 8x partitions
     fatcat-ENV.container-updates
      => key: fcid
      => 4x partitions
     fatcat-ENV.file-updates
      => key: fcid
      => 4x partitions
+    fatcat-ENV.work-ident-updates
+     => work identifiers, written when a work is updated and needs re-indexing (eg, in scholar)
+     => 6x partitions
+     => key: doc ident ("work_{ident}")
+     => key compaction possible; long retention
+
+    scholar-ENV.sim-updates
+     => 6x partitions
+     => key: "sim_item_{}"
+     => key compaction possible; long retention
+    scholar-ENV.update-docs
+     => 12x partitions
+     => key: scholar doc identifier
+     => gzip compression
+     => key compaction possible
+     => short time-based retention (2 months?)
 
 ### Deprecated/Unused Topics
 
@@ -99,19 +176,39 @@ exists`; this seems safe, and the settings won't be over-ridden.
 
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ungrobided-pg
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.grobid-output-pg --config compression.type=gzip --config cleanup.policy=compact
 
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ingest-file-requests-daily --config retention.ms=7889400000 --config cleanup.policy=delete
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-requests-priority --config retention.ms=7889400000 --config cleanup.policy=delete
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-results
+
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact
+
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03
-    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.file-updates
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.container-updates
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic fatcat-qa.work-ident-updates
 
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-crossref
-    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite --config cleanup.policy=compact
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.ftp-pubmed --config cleanup.policy=compact
 
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-crossref-state
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-datacite-state
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.ftp-pubmed-state
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-pubmed
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-arxiv
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-pubmed-state
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-arxiv-state
 
+    # only 3 partitions in QA
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.pdf-text --config compression.type=gzip --config cleanup.policy=compact
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.pdf-thumbnail-180px-jpg --config cleanup.policy=compact
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.unextracted
+
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic scholar-qa.sim-updates
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic scholar-qa.update-docs --config compression.type=gzip --config cleanup.policy=compact --config retention.ms=7889400000
+
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.xml-doc --config compression.type=gzip --config cleanup.policy=compact
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.html-teixml --config compression.type=gzip --config cleanup.policy=compact
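+
+To verify settings after creation, or to adjust a config on an existing topic
+without recreating it (a sketch using the same zookeeper-era tooling; the
+topic names are just examples from above):
+
+    # show partition count, replication, and configs for one topic
+    ./kafka-topics.sh --describe --zookeeper localhost:2181 --topic sandcrawler-qa.pdf-text
+
+    # change retention on an already-created topic
+    ./kafka-configs.sh --zookeeper localhost:2181 --entity-type topics --entity-name scholar-qa.update-docs --alter --add-config retention.ms=7889400000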