about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/minio.py 6
-rw-r--r-- python/sandcrawler/persist.py 16
2 files changed, 18 insertions, 4 deletions
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 4126b4b..8b02211 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -66,6 +66,12 @@ class SandcrawlerMinioClient(object):
content_type = "application/octet-stream"
if extension.endswith('.xml'):
content_type = "application/xml"
+ if extension.endswith('.png'):
+ content_type = "image/png"
+ elif extension.endswith('.jpg') or extension.endswith('.jpeg'):
+ content_type = "image/jpeg"
+ elif extension.endswith('.txt'):
+ content_type = "text/plain"
self.mc.put_object(
bucket,
obj_path,
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 6d9298e..fbc5273 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -232,8 +232,6 @@ class PersistGrobidWorker(SandcrawlerWorker):
def __init__(self, db_url, **kwargs):
super().__init__()
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
self.grobid = GrobidClient()
self.s3 = SandcrawlerMinioClient(
host_url=kwargs.get('s3_url', 'localhost:9000'),
@@ -244,6 +242,12 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.s3_only = kwargs.get('s3_only', False)
self.db_only = kwargs.get('db_only', False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
+ if not self.s3_only:
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+ else:
+ self.db = None
+ self.cur = None
def process(self, record, key=None):
"""
@@ -385,8 +389,6 @@ class PersistPdfTextWorker(SandcrawlerWorker):
def __init__(self, db_url, **kwargs):
super().__init__()
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
self.s3 = SandcrawlerMinioClient(
host_url=kwargs.get('s3_url', 'localhost:9000'),
access_key=kwargs['s3_access_key'],
@@ -396,6 +398,12 @@ class PersistPdfTextWorker(SandcrawlerWorker):
self.s3_only = kwargs.get('s3_only', False)
self.db_only = kwargs.get('db_only', False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
+ if not self.s3_only:
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+ else:
+ self.db = None
+ self.cur = None
def process(self, record, key=None):
"""