about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-25 13:28:21 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-25 13:28:21 -0700
commit: 1c4b1bcd0384b655088028474bcbf13778f741c2 (patch)
tree: 3d31c2c6a0c1b3121f81960e9c8f193728cac478 /python
parent: 0c0585fc83bb155519c6e00c5c67920d2972116f (diff)
download: sandcrawler-1c4b1bcd0384b655088028474bcbf13778f741c2.tar.gz
download: sandcrawler-1c4b1bcd0384b655088028474bcbf13778f741c2.zip
changes from prod
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/minio.py | 6
-rw-r--r-- python/sandcrawler/persist.py | 16
2 files changed, 18 insertions, 4 deletions
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 4126b4b..8b02211 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -66,6 +66,12 @@ class SandcrawlerMinioClient(object):
content_type = "application/octet-stream"
if extension.endswith('.xml'):
content_type = "application/xml"
+ if extension.endswith('.png'):
+ content_type = "image/png"
+ elif extension.endswith('.jpg') or extension.endswith('.jpeg'):
+ content_type = "image/jpeg"
+ elif extension.endswith('.txt'):
+ content_type = "text/plain"
self.mc.put_object(
bucket,
obj_path,
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 6d9298e..fbc5273 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -232,8 +232,6 @@ class PersistGrobidWorker(SandcrawlerWorker):
def __init__(self, db_url, **kwargs):
super().__init__()
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
self.grobid = GrobidClient()
self.s3 = SandcrawlerMinioClient(
host_url=kwargs.get('s3_url', 'localhost:9000'),
@@ -244,6 +242,12 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.s3_only = kwargs.get('s3_only', False)
self.db_only = kwargs.get('db_only', False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
+ if not self.s3_only:
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+ else:
+ self.db = None
+ self.cur = None
def process(self, record, key=None):
"""
@@ -385,8 +389,6 @@ class PersistPdfTextWorker(SandcrawlerWorker):
def __init__(self, db_url, **kwargs):
super().__init__()
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
self.s3 = SandcrawlerMinioClient(
host_url=kwargs.get('s3_url', 'localhost:9000'),
access_key=kwargs['s3_access_key'],
@@ -396,6 +398,12 @@ class PersistPdfTextWorker(SandcrawlerWorker):
self.s3_only = kwargs.get('s3_only', False)
self.db_only = kwargs.get('db_only', False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
+ if not self.s3_only:
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+ else:
+ self.db = None
+ self.cur = None
def process(self, record, key=None):
"""