diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-03-12 15:12:22 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-03-12 18:51:19 +0100 |
commit | 750b5d4c53d1075ddd31c3357dd5f690eb5951e0 (patch) | |
tree | 6fd3171825ccaa4f1b669a4f06a63d4f9e560d21 | |
parent | 6f329d030ab2ee89461bbc3fd9cdbd25b58b4021 (diff) | |
download | sandcrawler-750b5d4c53d1075ddd31c3357dd5f690eb5951e0.tar.gz sandcrawler-750b5d4c53d1075ddd31c3357dd5f690eb5951e0.zip |
topics: add pubmed ftp topic
PubmedFTPWorker replaced OAI recently. This documents the new topic.
-rw-r--r-- | kafka/topics.md | 10 |
1 file changed, 9 insertions, 1 deletion
```diff
diff --git a/kafka/topics.md b/kafka/topics.md
index 0ce8610..9cd43bd 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -55,8 +55,15 @@ retention (on both a size and time basis).
 => ~1TB capacity; 8x crossref partitions, 4x datacite
 => key compaction possible
+ fatcat-ENV.ftp-pubmed
+ => new citations from FTP server, from: ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
+ => raw XML, one record per message (PubmedArticle, up to 25k records/day and 650MB/day)
+ => key: PMID
+ => key compaction possible
+
 fatcat-ENV.api-crossref-state
 fatcat-ENV.api-datacite-state
+ fatcat-ENV.ftp-pubmed-state
 fatcat-ENV.oaipmh-pubmed-state
 fatcat-ENV.oaipmh-arxiv-state
 fatcat-ENV.oaipmh-doaj-journals-state (DISABLED)
@@ -135,11 +142,12 @@ exists`; this seems safe, and the settings won't be over-ridden.
 ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-crossref
 ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.ftp-pubmed --config cleanup.policy=compact
 ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-crossref-state
 ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-datacite-state
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.ftp-pubmed-state
 ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-pubmed
 ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-arxiv
 ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-pubmed-state
 ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-arxiv-state
-
```