diff options
-rw-r--r-- | kafka/topics.md | 56 |
1 files changed, 35 insertions, 21 deletions
diff --git a/kafka/topics.md b/kafka/topics.md index 3a329f8..36337da 100644 --- a/kafka/topics.md +++ b/kafka/topics.md @@ -12,16 +12,18 @@ ENV below is one of `prod` or `qa`. All topics should default to `snappy` compression on-disk, and indefinite retention (on both a size and time basis). - sandcrawler-ENV.ungrobided - => PDF files in IA needing GROBID processing - => 50x partitions (huge! for worker parallelism) - => key: "sha1:<base32>" - - sandcrawler-ENV.grobid-output - => output of GROBID processing (from pdf-ungrobided feed) - => could get big; 16x partitions (to distribute data) + sandcrawler-ENV.grobid-output-pg + => output of GROBID processing using grobid_tool.py + => schema is sandcrawler-db style JSON: TEI-XML as a field + => expected to be large; 12 partitions => use GZIP compression (worth the overhead) - => key: "sha1:<base32>"; could compact + => key is sha1hex of PDF; enable key compaction + + sandcrawler-ENV.ungrobided-pg + => PDF files in IA needing GROBID processing + => schema is sandcrawler-db style JSON. Can be either `cdx` or `petabox` object + => fewer partitions with batch mode, but still a bunch (24?) + => key is sha1hex of PDF. enable time compaction (6 months?) fatcat-ENV.api-crossref fatcat-ENV.api-datacite @@ -31,16 +33,6 @@ retention (on both a size and time basis). => ~1TB capacity; 8x crossref partitions, 4x datacite => key compaction possible - fatcat-ENV.oaipmh-pubmed - fatcat-ENV.oaipmh-arxiv - fatcat-ENV.oaipmh-doaj-journals (DISABLED) - fatcat-ENV.oaipmh-doaj-articles (DISABLED) - => OAI-PMH harvester output - => full XML resource output (just the <record> part?) - => key: identifier - => ~1TB capacity; 4x-8x partitions - => key compaction possible - fatcat-ENV.api-crossref-state fatcat-ENV.api-datacite-state fatcat-ENV.oaipmh-pubmed-state @@ -72,6 +64,28 @@ retention (on both a size and time basis). 
=> key: fcid => 4x partitions +### Deprecated/Unused Topics + + sandcrawler-ENV.ungrobided + => PDF files in IA needing GROBID processing + => 50x partitions (huge! for worker parallelism) + => key: "sha1:<base32>" + + sandcrawler-ENV.grobid-output + => output of GROBID processing (from pdf-ungrobided feed) + => could get big; 16x partitions (to distribute data) + => use GZIP compression (worth the overhead) + => key: "sha1:<base32>"; could compact + + fatcat-ENV.oaipmh-pubmed + fatcat-ENV.oaipmh-arxiv + fatcat-ENV.oaipmh-doaj-journals (DISABLED) + fatcat-ENV.oaipmh-doaj-articles (DISABLED) + => OAI-PMH harvester output + => full XML resource output (just the <record> part?) + => key: identifier + => ~1TB capacity; 4x-8x partitions + => key compaction possible ## Create fatcat QA topics @@ -82,8 +96,8 @@ exists`; this seems safe, and the settings won't be over-ridden. ssh misc-vm cd /srv/kafka-broker/kafka_2.12-2.0.0/bin/ - ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 50 --topic sandcrawler-qa.ungrobided - ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 16 --topic sandcrawler-qa.grobid-output --config compression.type=gzip + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ungrobided-pg + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.grobid-output-pg --config compression.type=gzip --config cleanup.policy=compact ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03 |