From e4800fc4d0d0467d0e34a4059b941d001916e232 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Sep 2021 15:24:22 -0700 Subject: new 'daily' and 'priority' ingest request topics The old ingest request queue was always getting lopsided; I suspect this is because it was scaled up (additional partitions) at some point in the past. Hoping new topics will fix this. New '-priority' queue is like '-bulk', but for smaller-volume SPN-like requests. E.g., interactive mode. --- kafka/topics.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kafka') diff --git a/kafka/topics.md b/kafka/topics.md index a0ab6ff..a699e16 100644 --- a/kafka/topics.md +++ b/kafka/topics.md @@ -25,7 +25,8 @@ retention (on both a size and time basis). => fewer partitions with batch mode, but still a bunch (24?) => key is sha1hex of PDF. enable time compaction (6 months?) - sandcrawler-ENV.ingest-file-requests + sandcrawler-ENV.ingest-file-requests-daily + => was ingest-file-requests previously, but renamed/rebalanced => ingest requests from multiple sources; mostly continuous or pseudo-interactive => schema is JSON; see ingest proposal for fields. small objects. => fewer partitions with batch mode, but still a bunch (24) @@ -35,6 +36,10 @@ retention (on both a size and time basis). => ingest requests from bulk crawl sources; background processing => same as ingest-file-requests + sandcrawler-ENV.ingest-file-requests-priority + => ingest requests from smaller-volume SPN-like sources; eg, interactive mode + => same as ingest-file-requests + sandcrawler-ENV.ingest-file-results => ingest requests from multiple sources => schema is JSON; see ingest proposal for fields. small objects. @@ -171,8 +176,9 @@ exists`; this seems safe, and the settings won't be over-ridden. 
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ungrobided-pg ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.grobid-output-pg --config compression.type=gzip --config cleanup.policy=compact - ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ingest-file-requests --config retention.ms=7889400000 --config cleanup.policy=delete + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ingest-file-requests-daily --config retention.ms=7889400000 --config cleanup.policy=delete ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-requests-priority --config retention.ms=7889400000 --config cleanup.policy=delete ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-results ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact -- cgit v1.2.3