aboutsummaryrefslogtreecommitdiffstats
path: root/python/persist_tool.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-12-26 21:17:27 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-02 18:12:58 -0800
commit6a5a0b090d7f303f3332759d63ffd0ac77cdd28c (patch)
tree0d3f418b78166c8411ef5c54852c01bb41eb8946 /python/persist_tool.py
parent293d4b176855d400324559c814abd2e404cdf31e (diff)
downloadsandcrawler-6a5a0b090d7f303f3332759d63ffd0ac77cdd28c.tar.gz
sandcrawler-6a5a0b090d7f303f3332759d63ffd0ac77cdd28c.zip
add PersistGrobidDiskWorker
To help with making dumps directly from Kafka (eg, for partner delivery)
Diffstat (limited to 'python/persist_tool.py')
-rwxr-xr-xpython/persist_tool.py27
1 files changed, 27 insertions, 0 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 309601b..29345e2 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -50,6 +50,20 @@ def run_grobid(args):
)
pusher.run()
+def run_grobid_disk(args):
+ """
+ Writes XML to individual files on disk, and also prints non-XML metadata to
+ stdout as JSON, which can be redirected to a separate file.
+ """
+ worker = PersistGrobidDiskWorker(
+ output_dir=args.output_dir,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ )
+ pusher.run()
+
def run_ingest_file_result(args):
worker = PersistIngestFileResultWorker(
db_url=args.db_url,
@@ -97,6 +111,19 @@ def main():
sub_grobid.add_argument('json_file',
help="grobid file to import from (or '-' for stdin)",
type=argparse.FileType('r'))
+ sub_grobid.add_argument('--s3-only',
+ action='store_true',
+ help="only upload TEI-XML to S3 (don't write to database)")
+
+ sub_grobid_disk = subparsers.add_parser('grobid-disk',
        help="dump GROBID output to (local) files on disk")
+ sub_grobid_disk.set_defaults(func=run_grobid_disk)
+ sub_grobid_disk.add_argument('json_file',
+ help="grobid file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
+ sub_grobid_disk.add_argument('output_dir',
+ help="base directory to output into",
+ type=str)
sub_ingest_file_result = subparsers.add_parser('ingest-file-result',
        help="backfill an ingest_file_result JSON dump into postgresql")