author    Bryan Newbold <bnewbold@archive.org>  2019-12-26 21:16:59 -0800
committer Bryan Newbold <bnewbold@archive.org>  2020-01-02 18:12:58 -0800
commit    293d4b176855d400324559c814abd2e404cdf31e
tree      e88b72109cc209fe422264deabdd1e7ca937add7 /python/persist_tool.py
parent    9fda5323046cb3f87f0c7c7e07eca283ca52ce99
flesh out minio helper, add to grobid persist
Diffstat (limited to 'python/persist_tool.py')
-rwxr-xr-x  python/persist_tool.py | 22
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/python/persist_tool.py b/python/persist_tool.py
index d65fa53..309601b 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -1,12 +1,13 @@
#!/usr/bin/env python3
"""
-Commands for backfilling content from bulk files into postgresql and minio.
+Commands for backfilling content from bulk files into postgresql and s3 (minio).
Normally this is done by workers (in sandcrawler_worker.py) consuming from
Kafka feeds, but sometimes we have bulk processing output we want to backfill.
"""
+import os
import sys
import argparse
import datetime
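
For orientation, the pattern this tool implements is a line-oriented pusher feeding a persist worker. A minimal sketch, assuming a simplified JsonLinePusher interface (the real class is imported from the sandcrawler package and may batch records or use different method names):

    import json

    class JsonLinePusher:
        """Feed decoded JSON lines from a file object (or stdin) to a worker."""

        def __init__(self, worker, json_file):
            self.worker = worker
            self.json_file = json_file

        def run(self):
            for line in self.json_file:
                if not line.strip():
                    continue  # skip blank lines in the dump
                record = json.loads(line)
                self.worker.push_record(record)  # assumed worker method name
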
@@ -36,6 +37,11 @@ def run_cdx(args):
def run_grobid(args):
worker = PersistGrobidWorker(
db_url=args.db_url,
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ s3_only=args.s3_only,
)
pusher = JsonLinePusher(
worker,
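
A sketch of how the new s3_* keyword arguments might wire up a client inside the worker, assuming the minio Python library and taking s3_only to mean "skip the postgresql writes"; the real PersistGrobidWorker lives in the sandcrawler package and may differ:

    from minio import Minio

    class PersistGrobidWorker:
        def __init__(self, db_url, s3_url, s3_bucket,
                     s3_access_key, s3_secret_key, s3_only=False):
            self.db_url = db_url
            self.s3_bucket = s3_bucket
            self.s3_only = s3_only  # assumed: persist only to s3, not postgresql
            self.s3_client = Minio(
                s3_url,
                access_key=s3_access_key,
                secret_key=s3_secret_key,
                secure=False,  # local minio endpoints typically speak plain HTTP
            )
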
@@ -61,6 +67,18 @@ def main():
parser.add_argument('--db-url',
help="postgresql database connection string",
default="postgres:///sandcrawler")
+ parser.add_argument('--s3-url',
+ help="S3 (minio) backend URL",
+ default="localhost:9000")
+ parser.add_argument('--s3-access-key',
+ help="S3 (minio) credential",
+ default=os.environ.get('MINIO_ACCESS_KEY'))
+ parser.add_argument('--s3-secret-key',
+ help="S3 (minio) credential",
+ default=os.environ.get('MINIO_SECRET_KEY'))
+ parser.add_argument('--s3-bucket',
+ help="S3 (minio) bucket to persist into",
+ default="sandcrawler-dev")
subparsers = parser.add_subparsers()
sub_cdx = subparsers.add_parser('cdx',
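
Note that the two credential flags default to os.environ.get(...), so MINIO_ACCESS_KEY and MINIO_SECRET_KEY can be exported once in the shell rather than typed on the command line (where they would land in shell history and process listings); an unset variable simply yields a default of None. The same pattern in isolation:

    import os
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--s3-access-key',
        default=os.environ.get('MINIO_ACCESS_KEY'))  # None if unset
    args = parser.parse_args([])  # empty argv: value comes from the default
    print(args.s3_access_key)
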
@@ -74,7 +92,7 @@ def main():
help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
sub_grobid = subparsers.add_parser('grobid',
- help="backfill a grobid JSON ('pg') dump into postgresql and minio")
+ help="backfill a grobid JSON ('pg') dump into postgresql and s3 (minio)")
sub_grobid.set_defaults(func=run_grobid)
sub_grobid.add_argument('json_file',
help="grobid file to import from (or '-' for stdin)",