persist grobid: add option to skip S3 upload

The motivation for this is that the current S3 target (minio) is overloaded, with too many files on a single partition (80 million+). Going to look into SeaweedFS and other options, but for now stopping the minio persist step. All of the data is stored in Kafka anyway.
author: Bryan Newbold <bnewbold@archive.org> 2020-03-19 16:10:40 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-19 16:10:42 -0700
commit: 88f337f2cc40824ed3eaf32b1fec17c3b053bfdf (patch)
tree: ae7ae1a02906adf663098dc4e7762279d5ac2ac8 /python/sandcrawler/persist.py
parent: e21fac21cc5a4267357a499f75f048ee5fd38ddb (diff)
download: sandcrawler-88f337f2cc40824ed3eaf32b1fec17c3b053bfdf.tar.gz
sandcrawler-88f337f2cc40824ed3eaf32b1fec17c3b053bfdf.zip
1 files changed, 10 insertions, 7 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index f5de44a..379fd8b 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -240,6 +240,8 @@ class PersistGrobidWorker(SandcrawlerWorker):
             default_bucket=kwargs['s3_bucket'],
         )
         self.s3_only = kwargs.get('s3_only', False)
+        self.db_only = kwargs.get('db_only', False)
+        assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
 
     def process(self, record):
         """
@@ -264,13 +266,14 @@ class PersistGrobidWorker(SandcrawlerWorker):
                 continue
 
             assert len(r['key']) == 40
-            resp = self.s3.put_blob(
-                folder="grobid",
-                blob=r['tei_xml'],
-                sha1hex=r['key'],
-                extension=".tei.xml",
-            )
-            self.counts['s3-put'] += 1
+            if not self.db_only:
+                resp = self.s3.put_blob(
+                    folder="grobid",
+                    blob=r['tei_xml'],
+                    sha1hex=r['key'],
+                    extension=".tei.xml",
+                )
+                self.counts['s3-put'] += 1
 
             # enhance with teixml2json metadata, if available
             try:
author	Bryan Newbold <bnewbold@archive.org>	2020-03-19 16:10:40 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-03-19 16:10:42 -0700
commit	88f337f2cc40824ed3eaf32b1fec17c3b053bfdf (patch)
tree	ae7ae1a02906adf663098dc4e7762279d5ac2ac8 /python/sandcrawler/persist.py
parent	e21fac21cc5a4267357a499f75f048ee5fd38ddb (diff)
download	sandcrawler-88f337f2cc40824ed3eaf32b1fec17c3b053bfdf.tar.gz sandcrawler-88f337f2cc40824ed3eaf32b1fec17c3b053bfdf.zip