author    Bryan Newbold <bnewbold@archive.org>  2020-03-19 16:10:40 -0700
committer Bryan Newbold <bnewbold@archive.org>  2020-03-19 16:10:42 -0700
commit    88f337f2cc40824ed3eaf32b1fec17c3b053bfdf (patch)
tree      ae7ae1a02906adf663098dc4e7762279d5ac2ac8 /python
parent    e21fac21cc5a4267357a499f75f048ee5fd38ddb (diff)
persist grobid: add option to skip S3 upload
Motivation for this is that the current S3 target (minio) is overloaded, with too many files on a single partition (80 million+). Going to look into seaweedfs and other options, but for now stopping minio persist. The data is all stored in Kafka anyway.
Diffstat (limited to 'python')
-rwxr-xr-x  python/persist_tool.py          4
-rw-r--r--  python/sandcrawler/persist.py  17
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 19e6dd7..869af06 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -41,6 +41,7 @@ def run_grobid(args):
s3_access_key=args.s3_access_key,
s3_secret_key=args.s3_secret_key,
s3_only=args.s3_only,
+ db_only=args.db_only,
)
pusher = JsonLinePusher(
worker,
@@ -135,6 +136,9 @@ def main():
sub_grobid.add_argument('--s3-only',
action='store_true',
help="only upload TEI-XML to S3 (don't write to database)")
+ sub_grobid.add_argument('--db-only',
+ action='store_true',
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
sub_grobid_disk = subparsers.add_parser('grobid-disk',
help="dump GRBOID output to (local) files on disk")
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index f5de44a..379fd8b 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -240,6 +240,8 @@ class PersistGrobidWorker(SandcrawlerWorker):
default_bucket=kwargs['s3_bucket'],
)
self.s3_only = kwargs.get('s3_only', False)
+ self.db_only = kwargs.get('db_only', False)
+ assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
def process(self, record):
"""
@@ -264,13 +266,14 @@ class PersistGrobidWorker(SandcrawlerWorker):
continue
assert len(r['key']) == 40
- resp = self.s3.put_blob(
- folder="grobid",
- blob=r['tei_xml'],
- sha1hex=r['key'],
- extension=".tei.xml",
- )
- self.counts['s3-put'] += 1
+ if not self.db_only:
+ resp = self.s3.put_blob(
+ folder="grobid",
+ blob=r['tei_xml'],
+ sha1hex=r['key'],
+ extension=".tei.xml",
+ )
+ self.counts['s3-put'] += 1
# enhance with teixml2json metadata, if available
try:
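After this change, per-record routing in PersistGrobidWorker.process() works roughly as follows; this is a simplified sketch with stub S3/DB objects, and insert_grobid() is a hypothetical stand-in for the actual database write (the put_blob() call and its arguments are taken from the diff above).

```python
class _StubS3:
    def put_blob(self, folder, blob, sha1hex, extension):
        print(f"S3 put: {folder}/{sha1hex}{extension} ({len(blob)} bytes)")

class _StubDB:
    def insert_grobid(self, record):  # hypothetical stand-in for the real DB write
        print(f"DB insert: {record['key']}")

def route_record(record, s3_only=False, db_only=False, s3=_StubS3(), db=_StubDB()):
    """Simplified view of how one GROBID record is routed after this commit."""
    assert not (s3_only and db_only)  # mirrors the new constructor assertion

    if not db_only:
        # TEI-XML blob goes to S3 (minio); skipped when --db-only is set
        s3.put_blob(folder="grobid", blob=record['tei_xml'],
                    sha1hex=record['key'], extension=".tei.xml")
    if not s3_only:
        # status/metadata goes to sandcrawler-db; skipped when --s3-only is set
        db.insert_grobid(record)

# With --db-only, only the database write happens:
route_record({'key': '0' * 40, 'tei_xml': '<TEI/>'}, db_only=True)
```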