From 519b90d7f539b667e919c220a53626e7a4ac48bf Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 14 Feb 2020 14:32:57 +0100 Subject: pubmed ftp harvest and KafkaBs4XmlPusher * add PubmedFTPWorker * utils are currently stored alongside pubmed (e.g. ftpretr, xmlstream) but may live elsewhere, as they are more generic * add KafkaBs4XmlPusher --- python/fatcat_import.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'python/fatcat_import.py') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index ad4de0e2..eaab9cfe 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -39,14 +39,13 @@ def run_arxiv(args): ari = ArxivRawImporter(args.api, edit_batch_size=args.batch_size) if args.kafka_mode: - raise NotImplementedError - #KafkaBs4XmlPusher( - # ari, - # args.kafka_hosts, - # args.kafka_env, - # "api-arxiv", - # "fatcat-{}-import-arxiv".format(args.kafka_env), - #).run() + KafkaBs4XmlPusher( + ari, + args.kafka_hosts, + args.kafka_env, + "oaipmh-arxiv", + "fatcat-{}-import-arxiv".format(args.kafka_env), + ).run() else: Bs4XmlFilePusher(ari, args.xml_file, "record").run() @@ -57,14 +56,13 @@ def run_pubmed(args): do_updates=args.do_updates, lookup_refs=(not args.no_lookup_refs)) if args.kafka_mode: - raise NotImplementedError - #KafkaBs4XmlPusher( - # pi, - # args.kafka_hosts, - # args.kafka_env, - # "api-pubmed", - # "fatcat-{}import-arxiv".format(args.kafka_env), - #).run() + KafkaBs4XmlPusher( + pi, + args.kafka_hosts, + args.kafka_env, + "oaipmh-pubmed", + "fatcat-{}-import-pubmed".format(args.kafka_env), + ).run() else: Bs4XmlLargeFilePusher( pi, @@ -297,6 +295,7 @@ def main(): auth_var="FATCAT_AUTH_WORKER_ARXIV", ) sub_arxiv.add_argument('xml_file', + nargs='?', help="arXivRaw XML file to import from", default=sys.stdin, type=argparse.FileType('r')) sub_arxiv.add_argument('--kafka-mode', @@ -310,6 +309,7 @@ def main(): auth_var="FATCAT_AUTH_WORKER_PUBMED", ) sub_pubmed.add_argument('xml_file', + nargs='?', help="Pubmed XML file to import from", default=sys.stdin, type=argparse.FileType('r')) sub_pubmed.add_argument('issn_map_file', -- cgit v1.2.3