about summary refs log tree commit diff stats
path: root/python/fatcat_harvest.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2018-11-19 23:04:18 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2018-11-19 23:04:18 -0800
commit e590eec544ab6f2e54e8770f01e64eef3158fdaa (patch)
tree 5f1fe36a489e7e42642d96a3a719dcbd74d60901 /python/fatcat_harvest.py
parent 65bdebea35f2ab3c9c8b0f8a8b0a9a577a36bee2 (diff)
download fatcat-e590eec544ab6f2e54e8770f01e64eef3158fdaa.tar.gz
fatcat-e590eec544ab6f2e54e8770f01e64eef3158fdaa.zip
initial OAI-PMH harvesters
Diffstat (limited to 'python/fatcat_harvest.py')
-rwxr-xr-x python/fatcat_harvest.py 57
1 files changed, 54 insertions, 3 deletions
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index f1bb3416..6ecc3ec6 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -3,11 +3,13 @@
import sys
import argparse
import datetime
-from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker
+from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker,\
+ HarvestArxivWorker, HarvestPubmedWorker, HarvestDoajArticleWorker,\
+ HarvestDoajJournalWorker
def run_crossref(args):
worker = HarvestCrossrefWorker(
- args.kafka_hosts,
+ kafka_hosts=args.kafka_hosts,
produce_topic="fatcat-{}.crossref".format(args.env),
state_topic="fatcat-{}.crossref-state".format(args.env),
contact_email=args.contact_email,
@@ -17,7 +19,7 @@ def run_crossref(args):
def run_datacite(args):
worker = HarvestDataciteWorker(
- args.kafka_hosts,
+ kafka_hosts=args.kafka_hosts,
produce_topic="fatcat-{}.datacite".format(args.env),
state_topic="fatcat-{}.datacite-state".format(args.env),
contact_email=args.contact_email,
@@ -25,6 +27,43 @@ def run_datacite(args):
end_date=args.end_date)
worker.run()
+def run_arxiv(args):
+ worker = HarvestArxivWorker(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic="fatcat-{}.arxiv".format(args.env),
+ state_topic="fatcat-{}.arxiv-state".format(args.env),
+ start_date=args.start_date,
+ end_date=args.end_date)
+ worker.run()
+
+def run_pubmed(args):
+ worker = HarvestPubmedWorker(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic="fatcat-{}.pubmed".format(args.env),
+ state_topic="fatcat-{}.pubmed-state".format(args.env),
+ start_date=args.start_date,
+ end_date=args.end_date)
+ worker.run()
+
+def run_doaj_article(args):
+ worker = HarvestDoajArticleWorker(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic="fatcat-{}.doaj-article".format(args.env),
+ state_topic="fatcat-{}.doaj-article-state".format(args.env),
+ start_date=args.start_date,
+ end_date=args.end_date)
+ worker.run()
+
+def run_doaj_journal(args):
+ worker = HarvestDoajJournalWorker(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic="fatcat-{}.doaj-journal".format(args.env),
+ state_topic="fatcat-{}.doaj-journal-state".format(args.env),
+ start_date=args.start_date,
+ end_date=args.end_date)
+ worker.run()
+
+
def mkdate(raw):
return datetime.datetime.strptime(raw, "%Y-%m-%d").date()
@@ -59,6 +98,18 @@ def main():
sub_datacite = subparsers.add_parser('datacite')
sub_datacite.set_defaults(func=run_datacite)
+ sub_arxiv = subparsers.add_parser('arxiv')
+ sub_arxiv.set_defaults(func=run_arxiv)
+
+ sub_pubmed = subparsers.add_parser('pubmed')
+ sub_pubmed.set_defaults(func=run_pubmed)
+
+ # DOAJ stuff disabled because API range-requests are broken
+ #sub_doaj_article = subparsers.add_parser('doaj-article')
+ #sub_doaj_article.set_defaults(func=run_doaj_article)
+ #sub_doaj_journal = subparsers.add_parser('doaj-journal')
+ #sub_doaj_journal.set_defaults(func=run_doaj_journal)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do!")