aboutsummaryrefslogtreecommitdiffstats
path: root/kafka
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-10-27 15:54:10 -0700
committerBryan Newbold <bnewbold@archive.org>2020-10-27 15:54:10 -0700
commit8e30d5ff73703a74c939b398e8c73b6f43c87fe0 (patch)
tree3b2b921a983342844b255d1791741f896e9f8532 /kafka
parent12a51fd28ca64338fca040ab7c470a70bf7a2a1b (diff)
downloadsandcrawler-8e30d5ff73703a74c939b398e8c73b6f43c87fe0.tar.gz
sandcrawler-8e30d5ff73703a74c939b398e8c73b6f43c87fe0.zip
kafka topics for fatcat -> scholar pipeline
Diffstat (limited to 'kafka')
-rw-r--r--kafka/topics.md19
1 files changed, 19 insertions, 0 deletions
diff --git a/kafka/topics.md b/kafka/topics.md
index ebe7a61..7a34c83 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -110,6 +110,22 @@ retention (on both a size and time basis).
fatcat-ENV.file-updates
=> key: fcid
=> 4x partitions
+ fatcat-ENV.work-ident-updates
+ => work identifiers when updated and needs re-indexing (eg, in scholar)
+ => 6x partitions
+ => key: doc ident ("work_{ident}")
+ => key compaction possible; long retention
+
+ scholar-ENV.sim-updates
+ => 6x partitions
+ => key: "sim_item_{}"
+ => key compaction possible; long retention
+ scholar-ENV.update-docs
+ => 12x partitions
+ => key: scholar doc identifer
+ => gzip compression
+ => key compaction possible
+ => short time-based retention (2 months?)
### Deprecated/Unused Topics
@@ -157,6 +173,7 @@ exists`; this seems safe, and the settings won't be over-ridden.
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.file-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.container-updates
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic fatcat-qa.work-ident-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-crossref
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite --config cleanup.policy=compact
@@ -175,3 +192,5 @@ exists`; this seems safe, and the settings won't be over-ridden.
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.pdf-thumbnail-180px-jpg --config cleanup.policy=compact
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.unextracted
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic scholar-qa.sim-updates
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic scholar-qa.update-docs --config compression.type=gzip --config cleanup.policy=compact --config retention.ms=7889400000