author    | Bryan Newbold <bnewbold@archive.org> | 2020-03-19 16:10:40 -0700
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-19 16:10:42 -0700
commit    | 88f337f2cc40824ed3eaf32b1fec17c3b053bfdf
tree      | ae7ae1a02906adf663098dc4e7762279d5ac2ac8
parent    | e21fac21cc5a4267357a499f75f048ee5fd38ddb
persist grobid: add option to skip S3 upload
The motivation is that the current S3 target (minio) is overloaded, with
too many files on a single partition (80 million+). Going to look into
seaweedfs and other options, but for now stopping the minio persist.
Data is all stored in Kafka anyway.
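
In effect, this adds a `--db-only` mode mirroring the existing `--s3-only` flag. A hedged invocation sketch, assuming the subcommand is named `grobid` and takes a JSON-lines input file (both inferred from the diff below, not shown verbatim in it):

    ./persist_tool.py grobid --db-only grobid_output.json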
-rwxr-xr-x | python/persist_tool.py        |  4
-rw-r--r-- | python/sandcrawler/persist.py | 17
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 19e6dd7..869af06 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -41,6 +41,7 @@ def run_grobid(args):
         s3_access_key=args.s3_access_key,
         s3_secret_key=args.s3_secret_key,
         s3_only=args.s3_only,
+        db_only=args.db_only,
     )
     pusher = JsonLinePusher(
         worker,
@@ -135,6 +136,9 @@ def main():
     sub_grobid.add_argument('--s3-only',
         action='store_true',
         help="only upload TEI-XML to S3 (don't write to database)")
+    sub_grobid.add_argument('--db-only',
+        action='store_true',
+        help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
 
     sub_grobid_disk = subparsers.add_parser('grobid-disk',
         help="dump GRBOID output to (local) files on disk")
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index f5de44a..379fd8b 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -240,6 +240,8 @@ class PersistGrobidWorker(SandcrawlerWorker):
             default_bucket=kwargs['s3_bucket'],
         )
         self.s3_only = kwargs.get('s3_only', False)
+        self.db_only = kwargs.get('db_only', False)
+        assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
 
     def process(self, record):
         """
@@ -264,13 +266,14 @@ class PersistGrobidWorker(SandcrawlerWorker):
                 continue
             assert len(r['key']) == 40
 
-            resp = self.s3.put_blob(
-                folder="grobid",
-                blob=r['tei_xml'],
-                sha1hex=r['key'],
-                extension=".tei.xml",
-            )
-            self.counts['s3-put'] += 1
+            if not self.db_only:
+                resp = self.s3.put_blob(
+                    folder="grobid",
+                    blob=r['tei_xml'],
+                    sha1hex=r['key'],
+                    extension=".tei.xml",
+                )
+                self.counts['s3-put'] += 1
 
             # enhance with teixml2json metadata, if available
             try:
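
To make the control flow easier to see outside the diff context, here is a minimal, runnable sketch of the pattern this commit implements: two mutually exclusive mode flags guarded by an assertion, with the S3 upload wrapped in a conditional. `FakeS3Client`, `PersistGrobidSketch`, and the record shape are illustrative assumptions, not sandcrawler's real API:

    # Sketch of the db_only/s3_only pattern; names here are illustrative.
    class FakeS3Client:
        def put_blob(self, folder, blob, sha1hex, extension):
            print(f"PUT {folder}/{sha1hex}{extension} ({len(blob)} bytes)")

    class PersistGrobidSketch:
        def __init__(self, s3_client, **kwargs):
            self.s3 = s3_client
            self.s3_only = kwargs.get('s3_only', False)
            self.db_only = kwargs.get('db_only', False)
            # both flags together would mean "persist nowhere"; fail fast
            assert not (self.s3_only and self.db_only), \
                "Only one of s3_only and db_only allowed"
            self.counts = {'s3-put': 0, 'db-insert': 0}

        def process(self, record):
            # keys are 40-character SHA-1 hex digests, as in the real worker
            assert len(record['key']) == 40
            if not self.db_only:
                self.s3.put_blob(
                    folder="grobid",
                    blob=record['tei_xml'],
                    sha1hex=record['key'],
                    extension=".tei.xml",
                )
                self.counts['s3-put'] += 1
            if not self.s3_only:
                # the real worker writes status rows to sandcrawler-db here
                self.counts['db-insert'] += 1

    worker = PersistGrobidSketch(FakeS3Client(), db_only=True)
    worker.process({'key': '0' * 40, 'tei_xml': '<TEI/>'})
    print(worker.counts)  # {'s3-put': 0, 'db-insert': 1}

Guarding only the `put_blob` call leaves the rest of the processing path untouched, which is why the change is so small: the database write path already existed, and `db_only` simply skips the overloaded object store.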