From 88f337f2cc40824ed3eaf32b1fec17c3b053bfdf Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 19 Mar 2020 16:10:40 -0700
Subject: persist grobid: add option to skip S3 upload

Motivation for this is that the current S3 target (minio) is overloaded,
with too many files on a single partition (80 million+). Going to look
into seaweedfs and other options, but for now stopping minio persist.
Data is all stored in kafka anyway.
---
 python/persist_tool.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/persist_tool.py b/python/persist_tool.py
index 19e6dd7..869af06 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -41,6 +41,7 @@ def run_grobid(args):
         s3_access_key=args.s3_access_key,
         s3_secret_key=args.s3_secret_key,
         s3_only=args.s3_only,
+        db_only=args.db_only,
     )
     pusher = JsonLinePusher(
         worker,
@@ -135,6 +136,9 @@ def main():
     sub_grobid.add_argument('--s3-only',
         action='store_true',
         help="only upload TEI-XML to S3 (don't write to database)")
+    sub_grobid.add_argument('--db-only',
+        action='store_true',
+        help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
 
     sub_grobid_disk = subparsers.add_parser('grobid-disk',
         help="dump GRBOID output to (local) files on disk")
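
For context, a minimal sketch of how a persist worker might honor the two
complementary flags. This is an assumption based only on the argument names
visible in the diff, not the actual sandcrawler implementation; the real
PersistGrobidWorker lives elsewhere in the repo, and upload_tei_xml() /
insert_status() are hypothetical helper names:

    # Hedged sketch only; not the real PersistGrobidWorker.
    class PersistGrobidWorker:
        def __init__(self, s3_only=False, db_only=False, **kwargs):
            # The two flags disable opposite halves of the persist step,
            # so setting both would mean doing no work at all.
            assert not (s3_only and db_only)
            self.s3_only = s3_only
            self.db_only = db_only

        def process(self, record):
            if not self.db_only:
                self.upload_tei_xml(record)  # S3 write; skipped with --db-only
            if not self.s3_only:
                self.insert_status(record)   # DB write; skipped with --s3-only

Invocation would then be along the lines of
`./persist_tool.py grobid --db-only <grobid_output.json>`, where the input is
the JSON-lines file consumed by the JsonLinePusher shown in the first hunk.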