aboutsummaryrefslogtreecommitdiffstats
path: root/kafka
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-03-12 15:12:22 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-03-12 18:51:19 +0100
commit: 750b5d4c53d1075ddd31c3357dd5f690eb5951e0 (patch)
tree: 6fd3171825ccaa4f1b669a4f06a63d4f9e560d21 /kafka
parent: 6f329d030ab2ee89461bbc3fd9cdbd25b58b4021 (diff)
download: sandcrawler-750b5d4c53d1075ddd31c3357dd5f690eb5951e0.tar.gz
sandcrawler-750b5d4c53d1075ddd31c3357dd5f690eb5951e0.zip
topics: add pubmed ftp topic
PubmedFTPWorker replaced OAI recently. This documents the new topic.
Diffstat (limited to 'kafka')
-rw-r--r-- kafka/topics.md 10
1 file changed, 9 insertions, 1 deletion
diff --git a/kafka/topics.md b/kafka/topics.md
index 0ce8610..9cd43bd 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -55,8 +55,15 @@ retention (on both a size and time basis).
=> ~1TB capacity; 8x crossref partitions, 4x datacite
=> key compaction possible
+ fatcat-ENV.ftp-pubmed
+ => new citations from FTP server, from: ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
+ => raw XML, one record per message (PubmedArticle, up to 25k records/day and 650MB/day)
+ => key: PMID
+ => key compaction possible
+
fatcat-ENV.api-crossref-state
fatcat-ENV.api-datacite-state
+ fatcat-ENV.ftp-pubmed-state
fatcat-ENV.oaipmh-pubmed-state
fatcat-ENV.oaipmh-arxiv-state
fatcat-ENV.oaipmh-doaj-journals-state (DISABLED)
@@ -135,11 +142,12 @@ exists`; this seems safe, and the settings won't be over-ridden.
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-crossref
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.ftp-pubmed --config cleanup.policy=compact
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-crossref-state
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-datacite-state
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.ftp-pubmed-state
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-pubmed
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-arxiv
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-pubmed-state
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-arxiv-state
-