about summary refs log tree commit diff stats
path: root/python/grobid_tool.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-09-26 12:00:01 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-09-26 12:00:01 -0700
commit 37bf997dc0220a30605249655056e90f04e33366 (patch)
tree 3f6a3586462d25c02b5fd219b0c754aef2976e3c /python/grobid_tool.py
parent c3c5a6ef57e83ff4395f9f87e7e372c6c371e4a5 (diff)
download sandcrawler-37bf997dc0220a30605249655056e90f04e33366.tar.gz
sandcrawler-37bf997dc0220a30605249655056e90f04e33366.zip
lots of grobid tool implementation (still WIP)
Diffstat (limited to 'python/grobid_tool.py')
-rwxr-xr-x python/grobid_tool.py 87
1 files changed, 87 insertions, 0 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
new file mode 100755
index 0000000..c9bcdc8
--- /dev/null
+++ b/python/grobid_tool.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+"""
+These are generally for running one-off tasks from the command line. Output
+might go to stdout, or might go to a Kafka topic.
+"""
+
+import sys
+import argparse
+import datetime
+
+from sandcrawler import *
+
+
+def run_extract_json(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ wayback_client = WaybackClient()
+ worker = GrobidWorker(grobid_client, wayback_client, sink=None)
+ multi_worker = MultiprocessWrapper(worker, args.sink)
+ pusher = JsonBatchPusher(worker, args.json_file, batch_size=30)
+ pusher.run()
+
+def run_extract_cdx(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ wayback_client = WaybackClient()
+ worker = GrobidWorker(grobid_client, wayback_client, sink=None)
+ multi_worker = MultiprocessWrapper(worker, args.sink)
+ pusher = CdxLinePusher(multi_worker, args.cdx_file,
+ filter_http_statuses=[200], filter_mimetypes=['application/pdf'],
+ batch_size=30)
+ pusher.run()
+
+def run_extract_zipfile(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ worker = GrobidBlobWorker(grobid_client, sink=args.sink)
+ pusher = ZipfilePusher(worker, args.zip_file)
+ pusher.run()
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--kafka-mode',
+ action='store_true',
+ help="send output to Kafka (not stdout)")
+ parser.add_argument('--kafka-hosts',
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use")
+ parser.add_argument('--kafka-env',
+ default="dev",
+ help="Kafka topic namespace to use (eg, prod, qa, dev)")
+ parser.add_argument('--grobid-host',
+ default="http://grobid.qa.fatcat.wiki",
+ help="GROBID API host/port")
+ subparsers = parser.add_subparsers()
+
+ sub_extract_json = subparsers.add_parser('extract-json')
+ sub_extract_json.set_defaults(func=run_extract_json)
+ sub_extract_json.add_argument('json_file',
+ help="JSON file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
+
+ sub_extract_cdx = subparsers.add_parser('extract-cdx')
+ sub_extract_cdx.set_defaults(func=run_extract_cdx)
+ sub_extract_cdx.add_argument('cdx_file',
+ help="CDX file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
+
+ sub_extract_zipfile = subparsers.add_parser('extract-zipfile')
+ sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
+ sub_extract_zipfile.add_argument('zip_file',
+ help="zipfile with PDFs to extract",
+ type=str)
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("tell me what to do!")
+ sys.exit(-1)
+
+ args.sink = None
+ if args.kafka_mode:
+ produce_topic = "sandcrawler-{}.grobid-output-json".format(args.kafka_env)
+ args.sink = KafkaGrobidSink(kafka_hosts=args.kafka_hosts,
+ produce_topic=produce_topic)
+
+ args.func(args)
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == '__main__':
+ main()