diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:40:55 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:42:43 -0800 |
commit | 94912e739c51d2fa4d5f9de878d0b0f0544a4459 (patch) | |
tree | af7803bee388beba7dd6dce2113e3632284537ac /python/persist_tool.py | |
parent | 6b3ce3169847a16fe6c0ab00f3a8af8b8ad099ab (diff) | |
download | sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.tar.gz sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.zip |
pdftrio basic python code
This is basically just a copy/paste of GROBID code, only simpler!
Diffstat (limited to 'python/persist_tool.py')
-rwxr-xr-x | python/persist_tool.py | 18 |
1 files changed, 18 insertions, 0 deletions
```diff
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 7187719..80b1156 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -63,6 +63,17 @@ def run_grobid_disk(args):
     )
     pusher.run()
 
+def run_pdftrio(args):
+    worker = PersistPdfTrioWorker(
+        db_url=args.db_url,
+    )
+    pusher = JsonLinePusher(
+        worker,
+        args.json_file,
+        batch_size=100,
+    )
+    pusher.run()
+
 def run_ingest_file_result(args):
     worker = PersistIngestFileResultWorker(
         db_url=args.db_url,
@@ -124,6 +135,13 @@ def main():
         help="base directory to output into",
         type=str)
 
+    sub_pdftrio = subparsers.add_parser('pdftrio',
+        help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (minio)")
+    sub_pdftrio.set_defaults(func=run_pdftrio)
+    sub_pdftrio.add_argument('json_file',
+        help="pdftrio file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
     sub_ingest_file_result = subparsers.add_parser('ingest-file-result',
         help="backfill a ingest_file_result JSON dump into postgresql")
     sub_ingest_file_result.set_defaults(func=run_ingest_file_result)
```