diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/minio.py | 6 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 16 |
2 files changed, 18 insertions, 4 deletions
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py index 4126b4b..8b02211 100644 --- a/python/sandcrawler/minio.py +++ b/python/sandcrawler/minio.py @@ -66,6 +66,12 @@ class SandcrawlerMinioClient(object): content_type = "application/octet-stream" if extension.endswith('.xml'): content_type = "application/xml" + if extension.endswith('.png'): + content_type = "image/png" + elif extension.endswith('.jpg') or extension.endswith('.jpeg'): + content_type = "image/jpeg" + elif extension.endswith('.txt'): + content_type = "text/plain" self.mc.put_object( bucket, obj_path, diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 6d9298e..fbc5273 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -232,8 +232,6 @@ class PersistGrobidWorker(SandcrawlerWorker): def __init__(self, db_url, **kwargs): super().__init__() - self.db = SandcrawlerPostgresClient(db_url) - self.cur = self.db.conn.cursor() self.grobid = GrobidClient() self.s3 = SandcrawlerMinioClient( host_url=kwargs.get('s3_url', 'localhost:9000'), @@ -244,6 +242,12 @@ class PersistGrobidWorker(SandcrawlerWorker): self.s3_only = kwargs.get('s3_only', False) self.db_only = kwargs.get('db_only', False) assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed" + if not self.s3_only: + self.db = SandcrawlerPostgresClient(db_url) + self.cur = self.db.conn.cursor() + else: + self.db = None + self.cur = None def process(self, record, key=None): """ @@ -385,8 +389,6 @@ class PersistPdfTextWorker(SandcrawlerWorker): def __init__(self, db_url, **kwargs): super().__init__() - self.db = SandcrawlerPostgresClient(db_url) - self.cur = self.db.conn.cursor() self.s3 = SandcrawlerMinioClient( host_url=kwargs.get('s3_url', 'localhost:9000'), access_key=kwargs['s3_access_key'], @@ -396,6 +398,12 @@ class PersistPdfTextWorker(SandcrawlerWorker): self.s3_only = kwargs.get('s3_only', False) self.db_only = kwargs.get('db_only', False) assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed" + if not self.s3_only: + self.db = SandcrawlerPostgresClient(db_url) + self.cur = self.db.conn.cursor() + else: + self.db = None + self.cur = None def process(self, record, key=None): """ |