From 2e6254251549dd932659e0a7f8bf6cda8e8abdf3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 21 Nov 2018 13:07:34 -0800 Subject: rename grobided to grobid-output --- kafka/grobid_kafka_notes.txt | 2 +- kafka/topics.md | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kafka') diff --git a/kafka/grobid_kafka_notes.txt b/kafka/grobid_kafka_notes.txt index 26c450f..0e565aa 100644 --- a/kafka/grobid_kafka_notes.txt +++ b/kafka/grobid_kafka_notes.txt @@ -39,4 +39,4 @@ Check ungrobided topic: Check grobid output: - kafkacat -C -b localhost:9092 -t sandcrawler-qa.grobided + kafkacat -C -b localhost:9092 -t sandcrawler-qa.grobid-output diff --git a/kafka/topics.md b/kafka/topics.md index 6361cc8..1ea2c83 100644 --- a/kafka/topics.md +++ b/kafka/topics.md @@ -17,7 +17,7 @@ retention (on both a size and time basis). => 50x partitions (huge! for worker parallelism) => key: "sha1:" - sandcrawler-ENV.grobided + sandcrawler-ENV.grobid-output => output of GROBID processing (from pdf-ungrobided feed) => could get big; 16x partitions (to distribute data) => use GZIP compression (worth the overhead) @@ -65,11 +65,15 @@ retention (on both a size and time basis). ## Create fatcat QA topics +If you run these commands for an existing topic, you'll get something like +`Error while executing topic command : Topic 'fatcat-qa.changelog' already +exists`; this seems safe, and the settings won't be overridden. 
+ ssh misc-vm cd /srv/kafka-broker/kafka_2.12-2.0.0/bin/ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 50 --topic sandcrawler-qa.ungrobided - ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 16 --topic sandcrawler-qa.grobided --config compression.type=gzip + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 16 --topic sandcrawler-qa.grobid-output --config compression.type=gzip ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates -- cgit v1.2.3