From 7186eb098b1e3f62288febe27db73685dacf1a2f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 20 Nov 2018 14:19:40 -0800 Subject: kafka notes --- kafka/topics.md | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 kafka/topics.md (limited to 'kafka/topics.md') diff --git a/kafka/topics.md b/kafka/topics.md new file mode 100644 index 0000000..6361cc8 --- /dev/null +++ b/kafka/topics.md @@ -0,0 +1,86 @@ + +This file lists all the Kafka topics currently used by sandcrawler (and +fatcat). + +NOTE: should use `.` or `_` in topic names, but not both. We chose to use `.` + +ENV below is one of `prod` or `qa`. + + +## Topic List + +All topics should default to `snappy` compression on-disk, and indefinite +retention (on both a size and time basis). + + sandcrawler-ENV.ungrobided + => PDF files in IA needing GROBID processing + => 50x partitions (huge! for worker parallelism) + => key: "sha1:" + + sandcrawler-ENV.grobided + => output of GROBID processing (from pdf-ungrobided feed) + => could get big; 16x partitions (to distribute data) + => use GZIP compression (worth the overhead) + => key: "sha1:"; could compact + + fatcat-ENV.api-crossref + fatcat-ENV.api-datacite + => all new and updated DOIs (regardless of type) + => full raw crossref/datacite API objects (JSON) + => key: lower-case DOI + => ~1TB capacity; 8x crossref partitions, 4x datacite + => key compaction possible + + fatcat-ENV.oaipmh-pubmed + fatcat-ENV.oaipmh-arxiv + fatcat-ENV.oaipmh-doaj-journals (DISABLED) + fatcat-ENV.oaipmh-doaj-articles (DISABLED) + => OAI-PMH harvester output + => full XML resource output (just the <record> part?) 
+ => key: identifier + => ~1TB capacity; 4x-8x partitions + => key compaction possible + + fatcat-ENV.api-crossref-state + fatcat-ENV.api-datacite-state + fatcat-ENV.oaipmh-pubmed-state + fatcat-ENV.oaipmh-arxiv-state + fatcat-ENV.oaipmh-doaj-journals-state (DISABLED) + fatcat-ENV.oaipmh-doaj-articles-state (DISABLED) + => serialized harvester state for ingesters + => custom JSON + => key: timespan? nothing to start + => 1x partitions; time/space limit OK + + fatcat-ENV.changelog + => small-ish objects (not fully expanded/hydrated) + => single partition + => key: could be changelog index (integer, as string) + + fatcat-ENV.release-updates + => contains "fully" expanded JSON objects + => key: fcid + => 8x partitions + + +## Create sandcrawler and fatcat QA topics + + ssh misc-vm + cd /srv/kafka-broker/kafka_2.12-2.0.0/bin/ + + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 50 --topic sandcrawler-qa.ungrobided + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 16 --topic sandcrawler-qa.grobided --config compression.type=gzip + + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates + + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-crossref + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-crossref-state + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-datacite-state + + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic 
fatcat-qa.oaipmh-pubmed + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-arxiv + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-pubmed-state + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-arxiv-state + -- cgit v1.2.3