diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-11-19 23:04:18 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-11-19 23:04:18 -0800 |
commit | e590eec544ab6f2e54e8770f01e64eef3158fdaa (patch) | |
tree | 5f1fe36a489e7e42642d96a3a719dcbd74d60901 /python/fatcat_harvest.py | |
parent | 65bdebea35f2ab3c9c8b0f8a8b0a9a577a36bee2 (diff) | |
download | fatcat-e590eec544ab6f2e54e8770f01e64eef3158fdaa.tar.gz fatcat-e590eec544ab6f2e54e8770f01e64eef3158fdaa.zip |
initial OAI-PMH harvesters
Diffstat (limited to 'python/fatcat_harvest.py')
-rwxr-xr-x | python/fatcat_harvest.py | 57 |
1 files changed, 54 insertions, 3 deletions
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py index f1bb3416..6ecc3ec6 100755 --- a/python/fatcat_harvest.py +++ b/python/fatcat_harvest.py @@ -3,11 +3,13 @@ import sys import argparse import datetime -from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker +from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker,\ + HarvestArxivWorker, HarvestPubmedWorker, HarvestDoajArticleWorker,\ + HarvestDoajJournalWorker def run_crossref(args): worker = HarvestCrossrefWorker( - args.kafka_hosts, + kafka_hosts=args.kafka_hosts, produce_topic="fatcat-{}.crossref".format(args.env), state_topic="fatcat-{}.crossref-state".format(args.env), contact_email=args.contact_email, @@ -17,7 +19,7 @@ def run_crossref(args): def run_datacite(args): worker = HarvestDataciteWorker( - args.kafka_hosts, + kafka_hosts=args.kafka_hosts, produce_topic="fatcat-{}.datacite".format(args.env), state_topic="fatcat-{}.datacite-state".format(args.env), contact_email=args.contact_email, @@ -25,6 +27,43 @@ def run_datacite(args): end_date=args.end_date) worker.run() +def run_arxiv(args): + worker = HarvestArxivWorker( + kafka_hosts=args.kafka_hosts, + produce_topic="fatcat-{}.arxiv".format(args.env), + state_topic="fatcat-{}.arxiv-state".format(args.env), + start_date=args.start_date, + end_date=args.end_date) + worker.run() + +def run_pubmed(args): + worker = HarvestPubmedWorker( + kafka_hosts=args.kafka_hosts, + produce_topic="fatcat-{}.pubmed".format(args.env), + state_topic="fatcat-{}.pubmed-state".format(args.env), + start_date=args.start_date, + end_date=args.end_date) + worker.run() + +def run_doaj_article(args): + worker = HarvestDoajArticleWorker( + kafka_hosts=args.kafka_hosts, + produce_topic="fatcat-{}.doaj-article".format(args.env), + state_topic="fatcat-{}.doaj-article-state".format(args.env), + start_date=args.start_date, + end_date=args.end_date) + worker.run() + +def run_doaj_journal(args): + worker = HarvestDoajJournalWorker( + kafka_hosts=args.kafka_hosts, + produce_topic="fatcat-{}.doaj-journal".format(args.env), + state_topic="fatcat-{}.doaj-journal-state".format(args.env), + start_date=args.start_date, + end_date=args.end_date) + worker.run() + + def mkdate(raw): return datetime.datetime.strptime(raw, "%Y-%m-%d").date() @@ -59,6 +98,18 @@ def main(): sub_datacite = subparsers.add_parser('datacite') sub_datacite.set_defaults(func=run_datacite) + sub_arxiv = subparsers.add_parser('arxiv') + sub_arxiv.set_defaults(func=run_arxiv) + + sub_pubmed = subparsers.add_parser('pubmed') + sub_pubmed.set_defaults(func=run_pubmed) + + # DOAJ stuff disabled because API range-requests are broken + #sub_doaj_article = subparsers.add_parser('doaj-article') + #sub_doaj_article.set_defaults(func=run_doaj_article) + #sub_doaj_journal = subparsers.add_parser('doaj-journal') + #sub_doaj_journal.set_defaults(func=run_doaj_journal) + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") |