aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-03 19:12:14 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-03 19:12:16 -0800
commit 3adcaf9802928346dda597cefd4b66b2e62fa942 (patch)
tree dc7794d8d142a73a41fbad4221d058dc3567a2e4 /python/sandcrawler
parent e99d9f2fddcb8b52ba52128b290ec5e0f367392f (diff)
download sandcrawler-3adcaf9802928346dda597cefd4b66b2e62fa942.tar.gz
sandcrawler-3adcaf9802928346dda597cefd4b66b2e62fa942.zip
refactor 'minio' to 'seaweedfs'; and BLOB env vars
This goes along with changes to ansible deployment to use the correct key names and values.
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/minio.py | 4
-rw-r--r-- python/sandcrawler/persist.py | 6
2 files changed, 6 insertions, 4 deletions
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 8b02211..c7deea1 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -17,8 +17,8 @@ class SandcrawlerMinioClient(object):
Example config:
host="localhost:9000",
- access_key=os.environ['MINIO_ACCESS_KEY'],
- secret_key=os.environ['MINIO_SECRET_KEY'],
+ access_key=os.environ['SANDCRAWLER_BLOB_ACCESS_KEY'],
+ secret_key=os.environ['SANDCRAWLER_BLOB_SECRET_KEY'],
"""
self.mc = minio.Minio(
host_url,
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index fbc5273..aa05195 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -452,9 +452,11 @@ class PersistPdfTextWorker(SandcrawlerWorker):
class PersistThumbnailWorker(SandcrawlerWorker):
"""
- Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL table.
+ Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL
+ table.
- This worker *must* be used with raw kakfa mode.
+ This worker *must* be used with raw kafka mode; thumbnails are *not*
+ wrapped in JSON like most sandcrawler kafka messages.
"""
def __init__(self, **kwargs):