From d2fb570038ced65e6890e689e900a0f1aaed917c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 17 Jun 2020 11:12:59 -0700
Subject: add new pdf workers/persisters

---
 python/persist_tool.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'python/persist_tool.py')

diff --git a/python/persist_tool.py b/python/persist_tool.py
index 869af06..4d78314 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -75,6 +75,23 @@ def run_pdftrio(args):
     )
     pusher.run()
 
+def run_pdftext(args):
+    worker = PersistPdfTextWorker(
+        db_url=args.db_url,
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+        s3_only=args.s3_only,
+        db_only=args.db_only,
+    )
+    pusher = JsonLinePusher(
+        worker,
+        args.json_file,
+        batch_size=50,
+    )
+    pusher.run()
+
 def run_ingest_file_result(args):
     worker = PersistIngestFileResultWorker(
         db_url=args.db_url,
@@ -140,6 +157,19 @@ def main():
         action='store_true',
         help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
 
+    sub_pdftext = subparsers.add_parser('pdftext',
+        help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (minio)")
+    sub_pdftext.set_defaults(func=run_pdftext)
+    sub_pdftext.add_argument('json_file',
+        help="pdftext file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+    sub_pdftext.add_argument('--s3-only',
+        action='store_true',
+        help="only upload TEI-XML to S3 (don't write to database)")
+    sub_pdftext.add_argument('--db-only',
+        action='store_true',
+        help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
+
     sub_grobid_disk = subparsers.add_parser('grobid-disk',
         help="dump GRBOID output to (local) files on disk")
     sub_grobid_disk.set_defaults(func=run_grobid_disk)
-- 
cgit v1.2.3