about summary refs log tree commit diff stats
path: root/python/persist_tool.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-17 11:12:59 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-17 11:12:59 -0700
commit d2fb570038ced65e6890e689e900a0f1aaed917c (patch)
tree 706a7f6107ae30c21d783773fa5d418f18d2aac6 /python/persist_tool.py
parent 82c7ec45dfbaa83e3b29b968846016cc6ae8e87f (diff)
download sandcrawler-d2fb570038ced65e6890e689e900a0f1aaed917c.tar.gz
sandcrawler-d2fb570038ced65e6890e689e900a0f1aaed917c.zip
add new pdf workers/persisters
Diffstat (limited to 'python/persist_tool.py')
-rwxr-xr-x python/persist_tool.py 30
1 files changed, 30 insertions, 0 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 869af06..4d78314 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -75,6 +75,23 @@ def run_pdftrio(args):
)
pusher.run()
+def run_pdftext(args):
+ worker = PersistPdfTextWorker(
+ db_url=args.db_url,
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ s3_only=args.s3_only,
+ db_only=args.db_only,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=50,
+ )
+ pusher.run()
+
def run_ingest_file_result(args):
worker = PersistIngestFileResultWorker(
db_url=args.db_url,
@@ -140,6 +157,19 @@ def main():
action='store_true',
help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
+ sub_pdftext = subparsers.add_parser('pdftext',
+ help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (minio)")
+ sub_pdftext.set_defaults(func=run_pdftext)
+ sub_pdftext.add_argument('json_file',
+ help="pdftext file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
+ sub_pdftext.add_argument('--s3-only',
+ action='store_true',
+ help="only upload TEI-XML to S3 (don't write to database)")
+ sub_pdftext.add_argument('--db-only',
+ action='store_true',
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
+
sub_grobid_disk = subparsers.add_parser('grobid-disk',
help="dump GRBOID output to (local) files on disk")
sub_grobid_disk.set_defaults(func=run_grobid_disk)