about | summary | refs | log | tree | commit | diff | stats
path: root/python/persist_tool.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-03 19:12:14 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-03 19:12:16 -0800
commit: 3adcaf9802928346dda597cefd4b66b2e62fa942 (patch)
tree: dc7794d8d142a73a41fbad4221d058dc3567a2e4 /python/persist_tool.py
parent: e99d9f2fddcb8b52ba52128b290ec5e0f367392f (diff)
download: sandcrawler-3adcaf9802928346dda597cefd4b66b2e62fa942.tar.gz
sandcrawler-3adcaf9802928346dda597cefd4b66b2e62fa942.zip
refactor 'minio' to 'seaweedfs'; and BLOB env vars
This goes along with changes to ansible deployment to use the correct key names and values.
Diffstat (limited to 'python/persist_tool.py')
-rwxr-xr-x python/persist_tool.py 18
1 files changed, 9 insertions, 9 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 66e02aa..69e9374 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
-Commands for backfilling content from bulk files into postgresql and s3 (minio).
+Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).
Normally this is done by workers (in sandcrawler_worker.py) consuming from
Kafka feeds, but sometimes we have bulk processing output we want to backfill.
@@ -120,16 +120,16 @@ def main():
help="postgresql database connection string",
default="postgres:///sandcrawler")
parser.add_argument('--s3-url',
- help="S3 (minio) backend URL",
+ help="S3 (seaweedfs) backend URL",
default="localhost:9000")
parser.add_argument('--s3-access-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_ACCESS_KEY'))
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
parser.add_argument('--s3-secret-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_SECRET_KEY'))
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_SECRET_KEY'))
parser.add_argument('--s3-bucket',
- help="S3 (minio) bucket to persist into",
+ help="S3 (seaweedfs) bucket to persist into",
default="sandcrawler-dev")
subparsers = parser.add_subparsers()
@@ -144,7 +144,7 @@ def main():
help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
sub_grobid = subparsers.add_parser('grobid',
- help="backfill a grobid JSON ('pg') dump into postgresql and s3 (minio)")
+ help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)")
sub_grobid.set_defaults(func=run_grobid)
sub_grobid.add_argument('json_file',
help="grobid file to import from (or '-' for stdin)",
@@ -180,7 +180,7 @@ def main():
type=str)
sub_pdftrio = subparsers.add_parser('pdftrio',
- help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (minio)")
+ help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)")
sub_pdftrio.set_defaults(func=run_pdftrio)
sub_pdftrio.add_argument('json_file',
help="pdftrio file to import from (or '-' for stdin)",