From 05bd7cbcc62588e431c5efd533189e246b2a997e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 Oct 2021 12:54:37 -0700 Subject: make fmt --- python/grobid_tool.py | 64 +++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 30 deletions(-) (limited to 'python/grobid_tool.py') diff --git a/python/grobid_tool.py b/python/grobid_tool.py index 0084330..4ba9540 100755 --- a/python/grobid_tool.py +++ b/python/grobid_tool.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ These are generally for running one-off tasks from the command line. Output might go to stdout, or might go to Kafka topic. @@ -30,6 +29,7 @@ def run_extract_json(args): pusher = JsonLinePusher(worker, args.json_file) pusher.run() + def run_extract_cdx(args): grobid_client = GrobidClient(host_url=args.grobid_host) wayback_client = WaybackClient() @@ -53,6 +53,7 @@ def run_extract_cdx(args): ) pusher.run() + def run_extract_zipfile(args): grobid_client = GrobidClient(host_url=args.grobid_host) if args.jobs > 1: @@ -65,6 +66,7 @@ def run_extract_zipfile(args): pusher = ZipfilePusher(worker, args.zip_file) pusher.run() + def run_transform(args): grobid_client = GrobidClient() for line in args.json_file: @@ -82,52 +84,54 @@ def run_transform(args): def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--kafka-mode', - action='store_true', - help="send output to Kafka (not stdout)") + action='store_true', + help="send output to Kafka (not stdout)") parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") + default="localhost:9092", + help="list of Kafka brokers (host/port) to use") parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('-j', '--jobs', - default=8, type=int, - help="parallelism for batch CPU jobs") + default="dev", + help="Kafka topic namespace to use (eg, prod, qa, dev)") + parser.add_argument('-j', + '--jobs', + default=8, + type=int, + help="parallelism for batch CPU jobs") parser.add_argument('--grobid-host', - default="http://grobid.qa.fatcat.wiki", - help="GROBID API host/port") + default="http://grobid.qa.fatcat.wiki", + help="GROBID API host/port") subparsers = parser.add_subparsers() - sub_extract_json = subparsers.add_parser('extract-json', + sub_extract_json = subparsers.add_parser( + 'extract-json', help="for each JSON line with CDX info, fetches PDF and does GROBID extraction") sub_extract_json.set_defaults(func=run_extract_json) sub_extract_json.add_argument('json_file', - help="JSON file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="JSON file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_extract_cdx = subparsers.add_parser('extract-cdx', - help="for each CDX line, fetches PDF and does GROBID extraction") + sub_extract_cdx = subparsers.add_parser( + 'extract-cdx', help="for each CDX line, fetches PDF and does GROBID extraction") sub_extract_cdx.set_defaults(func=run_extract_cdx) sub_extract_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="CDX file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_extract_zipfile = subparsers.add_parser('extract-zipfile', + sub_extract_zipfile = subparsers.add_parser( + 'extract-zipfile', help="opens zipfile, iterates over PDF files inside and does GROBID extract for each") sub_extract_zipfile.set_defaults(func=run_extract_zipfile) - sub_extract_zipfile.add_argument('zip_file', - help="zipfile with PDFs to extract", - type=str) + sub_extract_zipfile.add_argument('zip_file', help="zipfile with PDFs to extract", type=str) sub_transform = subparsers.add_parser('transform') sub_transform.set_defaults(func=run_transform) sub_transform.add_argument('--metadata-only', - action='store_true', - help="Only pass through bibliographic metadata, not fulltext") - sub_transform.add_argument('json_file', + action='store_true', + help="Only pass through bibliographic metadata, not fulltext") + sub_transform.add_argument( + 'json_file', help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field", type=argparse.FileType('r')) @@ -140,10 +144,10 @@ def main(): if args.kafka_mode: produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env) print("Running in kafka output mode, publishing to {}\n".format(produce_topic)) - args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, - produce_topic=produce_topic) + args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic) args.func(args) + if __name__ == '__main__': main() -- cgit v1.2.3