From 293d4b176855d400324559c814abd2e404cdf31e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 26 Dec 2019 21:16:59 -0800
Subject: flush out minio helper, add to grobid persist

---
 python/persist_tool.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/python/persist_tool.py b/python/persist_tool.py
index d65fa53..309601b 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 
 """
-Commands for backfilling content from bulk files into postgresql and minio.
+Commands for backfilling content from bulk files into postgresql and s3 (minio).
 
 Normally this is done by workers (in sandcrawler_worker.py) consuming from
 Kafka feeds, but sometimes we have bulk processing output we want to backfill.
 """
 
+import os
 import sys
 import argparse
 import datetime
@@ -36,6 +37,11 @@ def run_cdx(args):
 def run_grobid(args):
     worker = PersistGrobidWorker(
         db_url=args.db_url,
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+        s3_only=args.s3_only,
     )
     pusher = JsonLinePusher(
         worker,
@@ -61,6 +67,18 @@ def main():
     parser.add_argument('--db-url',
         help="postgresql database connection string",
         default="postgres:///sandcrawler")
+    parser.add_argument('--s3-url',
+        help="S3 (minio) backend URL",
+        default="localhost:9000")
+    parser.add_argument('--s3-access-key',
+        help="S3 (minio) credential",
+        default=os.environ.get('MINIO_ACCESS_KEY'))
+    parser.add_argument('--s3-secret-key',
+        help="S3 (minio) credential",
+        default=os.environ.get('MINIO_SECRET_KEY'))
+    parser.add_argument('--s3-bucket',
+        help="S3 (minio) bucket to persist into",
+        default="sandcrawler-dev")
     subparsers = parser.add_subparsers()
 
     sub_cdx = subparsers.add_parser('cdx',
@@ -74,7 +92,7 @@ def main():
         help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
 
     sub_grobid = subparsers.add_parser('grobid',
-        help="backfill a grobid JSON ('pg') dump into postgresql and minio")
+        help="backfill a grobid JSON ('pg') dump into postgresql and s3 (minio)")
     sub_grobid.set_defaults(func=run_grobid)
     sub_grobid.add_argument('json_file',
         help="grobid file to import from (or '-' for stdin)",
--
cgit v1.2.3
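
Note (not part of the patch): the credential flags above use os.environ.get() as the argparse default, so MINIO_ACCESS_KEY / MINIO_SECRET_KEY act only as fallbacks read once when the parser is built, and an explicit --s3-access-key / --s3-secret-key flag always wins. A minimal, self-contained sketch of that pattern, with the flag and variable names reused from the patch purely for illustration:

#!/usr/bin/env python3
# Standalone sketch of the env-var-default pattern used by the flags above.
# Not part of persist_tool.py; names are copied from the patch for illustration only.
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--s3-access-key',
    help="S3 (minio) credential",
    default=os.environ.get('MINIO_ACCESS_KEY'))

# No flag given: falls back to whatever MINIO_ACCESS_KEY held when the parser was built
print(parser.parse_args([]).s3_access_key)

# Explicit flag: overrides the environment variable
print(parser.parse_args(['--s3-access-key', 'example-key']).s3_access_key)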