summaryrefslogtreecommitdiffstats
path: root/python/fatcat_harvest.py
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-02-14 14:32:57 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-02-19 01:07:46 +0100
commit519b90d7f539b667e919c220a53626e7a4ac48bf (patch)
treeb7c3beed283d7dca732a8f2ab5b1dfe283bb69f3 /python/fatcat_harvest.py
parent4cbc94cd708c1db80a232150ab2cf56dddf83e62 (diff)
downloadfatcat-519b90d7f539b667e919c220a53626e7a4ac48bf.tar.gz
fatcat-519b90d7f539b667e919c220a53626e7a4ac48bf.zip
pubmed ftp harvest and KafkaBs4XmlPusher
* add PubmedFTPWorker * utils are currently stored alongside pubmed (e.g. ftpretr, xmlstream) but may live elsewhere, as they are more generic * add KafkaBs4XmlPusher
Diffstat (limited to 'python/fatcat_harvest.py')
-rwxr-xr-xpython/fatcat_harvest.py15
1 files changed, 11 insertions, 4 deletions
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index 58bef9ca..4c4f34a1 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -6,7 +6,7 @@ import datetime
import raven
from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker,\
HarvestArxivWorker, HarvestPubmedWorker, HarvestDoajArticleWorker,\
- HarvestDoajJournalWorker
+ HarvestDoajJournalWorker, PubmedFTPWorker
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
sentry_client = raven.Client()
@@ -42,10 +42,17 @@ def run_arxiv(args):
worker.run(continuous=args.continuous)
def run_pubmed(args):
- worker = HarvestPubmedWorker(
+ # worker = HarvestPubmedWorker(
+ # kafka_hosts=args.kafka_hosts,
+ # produce_topic="fatcat-{}.oaipmh-pubmed".format(args.env),
+ # state_topic="fatcat-{}.oaipmh-pubmed-state".format(args.env),
+ # start_date=args.start_date,
+ # end_date=args.end_date)
+ # worker.run(continuous=args.continuous)
+ worker = PubmedFTPWorker(
kafka_hosts=args.kafka_hosts,
- produce_topic="fatcat-{}.oaipmh-pubmed".format(args.env),
- state_topic="fatcat-{}.oaipmh-pubmed-state".format(args.env),
+ produce_topic="fatcat-{}.ftp-pubmed".format(args.env),
+ state_topic="fatcat-{}.ftp-pubmed-state".format(args.env),
start_date=args.start_date,
end_date=args.end_date)
worker.run(continuous=args.continuous)