about | summary | refs | log | tree | commit | diff | stats
path: root/python/persist_tool.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2020-02-12 19:40:55 -0800
committer: Bryan Newbold <bnewbold@archive.org>, 2020-02-12 19:42:43 -0800
commit: 94912e739c51d2fa4d5f9de878d0b0f0544a4459 (patch)
tree: af7803bee388beba7dd6dce2113e3632284537ac /python/persist_tool.py
parent: 6b3ce3169847a16fe6c0ab00f3a8af8b8ad099ab (diff)
download: sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.tar.gz
sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.zip
pdftrio basic python code
This is basically just a copy/paste of GROBID code, only simpler!
Diffstat (limited to 'python/persist_tool.py')
-rwxr-xr-x python/persist_tool.py 18
1 files changed, 18 insertions, 0 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 7187719..80b1156 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -63,6 +63,17 @@ def run_grobid_disk(args):
)
pusher.run()
+def run_pdftrio(args):
+ worker = PersistPdfTrioWorker(
+ db_url=args.db_url,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=100,
+ )
+ pusher.run()
+
def run_ingest_file_result(args):
worker = PersistIngestFileResultWorker(
db_url=args.db_url,
@@ -124,6 +135,13 @@ def main():
help="base directory to output into",
type=str)
+ sub_pdftrio = subparsers.add_parser('pdftrio',
+ help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (minio)")
+ sub_pdftrio.set_defaults(func=run_pdftrio)
+ sub_pdftrio.add_argument('json_file',
+ help="pdftrio file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
+
sub_ingest_file_result = subparsers.add_parser('ingest-file-result',
help="backfill a ingest_file_result JSON dump into postgresql")
sub_ingest_file_result.set_defaults(func=run_ingest_file_result)