From 3adcaf9802928346dda597cefd4b66b2e62fa942 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 3 Nov 2020 19:12:14 -0800
Subject: refactor 'minio' to 'seaweedfs'; and BLOB env vars

This goes along with changes to ansible deployment to use the correct
key names and values.
---
 python/example.env            |  4 ++--
 python/persist_tool.py        | 18 +++++++++---------
 python/sandcrawler/minio.py   |  4 ++--
 python/sandcrawler/persist.py |  6 ++++--
 python/sandcrawler_worker.py  | 20 ++++++++++----------
 5 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'python')

diff --git a/python/example.env b/python/example.env
index 4d3baa0..5064c96 100644
--- a/python/example.env
+++ b/python/example.env
@@ -1,5 +1,5 @@
-MINIO_ACCESS_KEY="minioadmin"
-MINIO_SECRET_KEY="minioadmin"
+SANDCRAWLER_BLOB_ACCESS_KEY="minioadmin"
+SANDCRAWLER_BLOB_SECRET_KEY="minioadmin"
 IA_ACCESS_KEY="dummy"
 IA_SECRET_KEY="dummy"
 CDX_AUTH_TOKEN="dummy"
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 66e02aa..69e9374 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 """
-Commands for backfilling content from bulk files into postgresql and s3 (minio).
+Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).
 
 Normally this is done by workers (in sandcrawler_worker.py) consuming from
 Kafka feeds, but sometimes we have bulk processing output we want to backfill.
@@ -120,16 +120,16 @@ def main():
         help="postgresql database connection string",
         default="postgres:///sandcrawler")
     parser.add_argument('--s3-url',
-        help="S3 (minio) backend URL",
+        help="S3 (seaweedfs) backend URL",
         default="localhost:9000")
     parser.add_argument('--s3-access-key',
-        help="S3 (minio) credential",
-        default=os.environ.get('MINIO_ACCESS_KEY'))
+        help="S3 (seaweedfs) credential",
+        default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
     parser.add_argument('--s3-secret-key',
-        help="S3 (minio) credential",
-        default=os.environ.get('MINIO_SECRET_KEY'))
+        help="S3 (seaweedfs) credential",
+        default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))
     parser.add_argument('--s3-bucket',
-        help="S3 (minio) bucket to persist into",
+        help="S3 (seaweedfs) bucket to persist into",
         default="sandcrawler-dev")
     subparsers = parser.add_subparsers()
 
@@ -144,7 +144,7 @@ def main():
         help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
 
     sub_grobid = subparsers.add_parser('grobid',
-        help="backfill a grobid JSON ('pg') dump into postgresql and s3 (minio)")
+        help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)")
     sub_grobid.set_defaults(func=run_grobid)
     sub_grobid.add_argument('json_file',
         help="grobid file to import from (or '-' for stdin)",
@@ -180,7 +180,7 @@ def main():
         type=str)
 
     sub_pdftrio = subparsers.add_parser('pdftrio',
-        help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (minio)")
+        help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)")
     sub_pdftrio.set_defaults(func=run_pdftrio)
     sub_pdftrio.add_argument('json_file',
         help="pdftrio file to import from (or '-' for stdin)",
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 8b02211..c7deea1 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -17,8 +17,8 @@ class SandcrawlerMinioClient(object):
         Example config:
 
             host="localhost:9000",
-            access_key=os.environ['MINIO_ACCESS_KEY'],
-            secret_key=os.environ['MINIO_SECRET_KEY'],
+            access_key=os.environ['SANDCRAWLER_BLOB_ACCESS_KEY'],
+            secret_key=os.environ['SANDCRAWLER_BLOB_SECRET_KEY'],
         """
         self.mc = minio.Minio(
             host_url,
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index fbc5273..aa05195 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -452,9 +452,11 @@ class PersistPdfTextWorker(SandcrawlerWorker):
 
 class PersistThumbnailWorker(SandcrawlerWorker):
     """
-    Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL table.
+    Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL
+    table.
 
-    This worker *must* be used with raw kakfa mode.
+    This worker *must* be used with raw kafka mode; thumbnails are *not*
+    wrapped in JSON like most sandcrawler kafka messages.
     """
 
     def __init__(self, **kwargs):
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index a653771..537398e 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -3,7 +3,7 @@
 """
 These are generally for continuously running workers that consume from Kafka.
 Outputs might either be pushed back into Kafka, or directly into sandcrawler-db
-or minio.
+or S3 (SeaweedFS).
 """
 
 import os
@@ -242,16 +242,16 @@ def main():
         help="postgresql database connection string",
         default="postgres:///sandcrawler")
     parser.add_argument('--s3-url',
-        help="S3 (minio) backend URL",
+        help="S3 (seaweedfs) backend URL",
         default="localhost:9000")
     parser.add_argument('--s3-access-key',
-        help="S3 (minio) credential",
-        default=os.environ.get('MINIO_ACCESS_KEY'))
+        help="S3 (seaweedfs) credential",
+        default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
     parser.add_argument('--s3-secret-key',
-        help="S3 (minio) credential",
-        default=os.environ.get('MINIO_SECRET_KEY'))
+        help="S3 (seaweedfs) credential",
+        default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))
     parser.add_argument('--s3-bucket',
-        help="S3 (minio) bucket to persist into",
+        help="S3 (seaweedfs) bucket to persist into",
         default="sandcrawler-dev")
     subparsers = parser.add_subparsers()
 
@@ -264,7 +264,7 @@ def main():
     sub_pdf_extract.set_defaults(func=run_pdf_extract)
 
     sub_persist_grobid = subparsers.add_parser('persist-grobid',
-        help="daemon that consumes GROBID output from Kafka and pushes to minio and postgres")
+        help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres")
     sub_persist_grobid.add_argument('--s3-only',
         action='store_true',
         help="only upload TEI-XML to S3 (don't write to database)")
@@ -274,7 +274,7 @@ def main():
     sub_persist_grobid.set_defaults(func=run_persist_grobid)
 
     sub_persist_pdftext = subparsers.add_parser('persist-pdftext',
-        help="daemon that consumes pdftext output from Kafka and pushes to minio and postgres")
+        help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres")
     sub_persist_pdftext.add_argument('--s3-only',
         action='store_true',
         help="only upload TEI-XML to S3 (don't write to database)")
@@ -284,7 +284,7 @@ def main():
     sub_persist_pdftext.set_defaults(func=run_persist_pdftext)
 
     sub_persist_thumbnail = subparsers.add_parser('persist-thumbnail',
-        help="daemon that consumes thumbnail output from Kafka and pushes to minio and postgres")
+        help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres")
     sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail)
 
     sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
-- 
cgit v1.2.3