diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 11:12:59 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 11:12:59 -0700 |
commit | d2fb570038ced65e6890e689e900a0f1aaed917c (patch) | |
tree | 706a7f6107ae30c21d783773fa5d418f18d2aac6 /python/persist_tool.py | |
parent | 82c7ec45dfbaa83e3b29b968846016cc6ae8e87f (diff) | |
download | sandcrawler-d2fb570038ced65e6890e689e900a0f1aaed917c.tar.gz sandcrawler-d2fb570038ced65e6890e689e900a0f1aaed917c.zip |
add new pdf workers/persisters
Diffstat (limited to 'python/persist_tool.py')
-rwxr-xr-x | python/persist_tool.py | 30 |
1 files changed, 30 insertions, 0 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 869af06..4d78314 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -75,6 +75,23 @@ def run_pdftrio(args):
     )
     pusher.run()
 
+def run_pdftext(args):
+    worker = PersistPdfTextWorker(
+        db_url=args.db_url,
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+        s3_only=args.s3_only,
+        db_only=args.db_only,
+    )
+    pusher = JsonLinePusher(
+        worker,
+        args.json_file,
+        batch_size=50,
+    )
+    pusher.run()
+
 def run_ingest_file_result(args):
     worker = PersistIngestFileResultWorker(
         db_url=args.db_url,
@@ -140,6 +157,19 @@ def main():
         action='store_true',
         help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
 
+    sub_pdftext = subparsers.add_parser('pdftext',
+        help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (minio)")
+    sub_pdftext.set_defaults(func=run_pdftext)
+    sub_pdftext.add_argument('json_file',
+        help="pdftext file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+    sub_pdftext.add_argument('--s3-only',
+        action='store_true',
+        help="only upload TEI-XML to S3 (don't write to database)")
+    sub_pdftext.add_argument('--db-only',
+        action='store_true',
+        help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
+
     sub_grobid_disk = subparsers.add_parser('grobid-disk',
         help="dump GRBOID output to (local) files on disk")
     sub_grobid_disk.set_defaults(func=run_grobid_disk)