about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_harvest.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:09 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:09 -0700
commit: 6464631dbe5c4afeb76f2f3c9d63b89f917c9a3b (patch)
tree: 633303839cafc7d901cf8565e034542606a5bb27 /python/fatcat_harvest.py
parent: cdfd6b85b386b7bbf9d5a5179ef26970b6e5a4e7 (diff)
download: fatcat-6464631dbe5c4afeb76f2f3c9d63b89f917c9a3b.tar.gz
fatcat-6464631dbe5c4afeb76f2f3c9d63b89f917c9a3b.zip
fmt (black): *.py
Diffstat (limited to 'python/fatcat_harvest.py')
-rwxr-xr-x python/fatcat_harvest.py 94
1 files changed, 56 insertions, 38 deletions
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index 0324aa52..91356aad 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -26,9 +26,11 @@ def run_crossref(args):
state_topic=f"fatcat-{args.env}.api-crossref-state",
contact_email=args.contact_email,
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_datacite(args):
worker = HarvestDataciteWorker(
kafka_hosts=args.kafka_hosts,
@@ -36,93 +38,108 @@ def run_datacite(args):
state_topic=f"fatcat-{args.env}.api-datacite-state",
contact_email=args.contact_email,
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_arxiv(args):
worker = HarvestArxivWorker(
kafka_hosts=args.kafka_hosts,
produce_topic=f"fatcat-{args.env}.oaipmh-arxiv",
state_topic=f"fatcat-{args.env}.oaipmh-arxiv-state",
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_pubmed(args):
worker = PubmedFTPWorker(
kafka_hosts=args.kafka_hosts,
produce_topic=f"fatcat-{args.env}.ftp-pubmed",
state_topic=f"fatcat-{args.env}.ftp-pubmed-state",
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_doaj_article(args):
worker = HarvestDoajArticleWorker(
kafka_hosts=args.kafka_hosts,
produce_topic=f"fatcat-{args.env}.oaipmh-doaj-article",
state_topic="fatcat-{args.env}.oaipmh-doaj-article-state",
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_doaj_journal(args):
worker = HarvestDoajJournalWorker(
kafka_hosts=args.kafka_hosts,
produce_topic=f"fatcat-{args.env}.oaipmh-doaj-journal",
state_topic=f"fatcat-{args.env}.oaipmh-doaj-journal-state",
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
def mkdate(raw):
return datetime.datetime.strptime(raw, "%Y-%m-%d").date()
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('--start-date',
- default=None, type=mkdate,
- help="beginning of harvest period")
- parser.add_argument('--end-date',
- default=None, type=mkdate,
- help="end of harvest period")
- parser.add_argument('--contact-email',
- default="undefined", # better?
- help="contact email to use in API header")
- parser.add_argument('--continuous',
- action='store_true',
- help="continue harvesting indefinitely in a loop?")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "--start-date", default=None, type=mkdate, help="beginning of harvest period"
+ )
+ parser.add_argument("--end-date", default=None, type=mkdate, help="end of harvest period")
+ parser.add_argument(
+ "--contact-email",
+ default="undefined", # better?
+ help="contact email to use in API header",
+ )
+ parser.add_argument(
+ "--continuous", action="store_true", help="continue harvesting indefinitely in a loop?"
+ )
subparsers = parser.add_subparsers()
- sub_crossref = subparsers.add_parser('crossref',
- help="harvest DOI metadata from Crossref API (JSON)")
+ sub_crossref = subparsers.add_parser(
+ "crossref", help="harvest DOI metadata from Crossref API (JSON)"
+ )
sub_crossref.set_defaults(func=run_crossref)
- sub_datacite = subparsers.add_parser('datacite',
- help="harvest DOI metadata from Datacite API (JSON)")
+ sub_datacite = subparsers.add_parser(
+ "datacite", help="harvest DOI metadata from Datacite API (JSON)"
+ )
sub_datacite.set_defaults(func=run_datacite)
- sub_arxiv = subparsers.add_parser('arxiv',
- help="harvest metadata from arxiv.org OAI-PMH endpoint (XML)")
+ sub_arxiv = subparsers.add_parser(
+ "arxiv", help="harvest metadata from arxiv.org OAI-PMH endpoint (XML)"
+ )
sub_arxiv.set_defaults(func=run_arxiv)
- sub_pubmed = subparsers.add_parser('pubmed',
- help="harvest MEDLINE/PubMed metadata from daily FTP updates (XML)")
+ sub_pubmed = subparsers.add_parser(
+ "pubmed", help="harvest MEDLINE/PubMed metadata from daily FTP updates (XML)"
+ )
sub_pubmed.set_defaults(func=run_pubmed)
# DOAJ stuff disabled because API range-requests are broken
- #sub_doaj_article = subparsers.add_parser('doaj-article')
- #sub_doaj_article.set_defaults(func=run_doaj_article)
- #sub_doaj_journal = subparsers.add_parser('doaj-journal')
- #sub_doaj_journal.set_defaults(func=run_doaj_journal)
+ # sub_doaj_article = subparsers.add_parser('doaj-article')
+ # sub_doaj_article.set_defaults(func=run_doaj_article)
+ # sub_doaj_journal = subparsers.add_parser('doaj-journal')
+ # sub_doaj_journal.set_defaults(func=run_doaj_journal)
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -130,5 +147,6 @@ def main():
sys.exit(-1)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()