From 94912e739c51d2fa4d5f9de878d0b0f0544a4459 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 12 Feb 2020 19:40:55 -0800
Subject: pdftrio basic python code

This is basically just a copy/paste of GROBID code, only simpler!
---
 python/persist_tool.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'python/persist_tool.py')

diff --git a/python/persist_tool.py b/python/persist_tool.py
index 7187719..80b1156 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -63,6 +63,17 @@ def run_grobid_disk(args):
     )
     pusher.run()
 
+def run_pdftrio(args):
+    worker = PersistPdfTrioWorker(
+        db_url=args.db_url,
+    )
+    pusher = JsonLinePusher(
+        worker,
+        args.json_file,
+        batch_size=100,
+    )
+    pusher.run()
+
 def run_ingest_file_result(args):
     worker = PersistIngestFileResultWorker(
         db_url=args.db_url,
@@ -124,6 +135,13 @@ def main():
         help="base directory to output into",
         type=str)
 
+    sub_pdftrio = subparsers.add_parser('pdftrio',
+        help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (minio)")
+    sub_pdftrio.set_defaults(func=run_pdftrio)
+    sub_pdftrio.add_argument('json_file',
+        help="pdftrio file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
     sub_ingest_file_result = subparsers.add_parser('ingest-file-result',
         help="backfill a ingest_file_result JSON dump into postgresql")
     sub_ingest_file_result.set_defaults(func=run_ingest_file_result)
-- 
cgit v1.2.3