From c61cb13ae42e3a170c29d4710ea2fc484081ee96 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 12 Feb 2020 19:01:44 -0800
Subject: pdftrio proposal and start on schema+kafka

---
 kafka/topics.md | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kafka')

diff --git a/kafka/topics.md b/kafka/topics.md
index 2735d51..0ce8610 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -41,6 +41,12 @@ retention (on both a size and time basis).
         => 6 partitions
         => can't think of a good key, so none; no compaction
 
+    sandcrawler-ENV.pdftrio-output
+        => output of each pdftrio ML classification
+        => schema is JSON; see pdftrio proposal for fields. small objects.
+        => 6 partitions
+        => key is sha1hex of PDF; enable key compaction
+
     fatcat-ENV.api-crossref
     fatcat-ENV.api-datacite
         => all new and updated DOIs (regardless of type)
@@ -119,6 +125,8 @@ exists`; this seems safe, and the settings won't be over-ridden.
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-results
 
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact
+
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates
-- 
cgit v1.2.3