diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 13:19:17 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 13:19:22 -0700 |
commit | 800860ecd25346ff4a638e9d42fa905396b8fa1b (patch) | |
tree | 736e032c1d412d2cfcc0b9807e2ce55bde35ba8c /python/sandcrawler/workers.py | |
parent | c216a9cdecd85db8296e6499593244686c430d8c (diff) | |
download | sandcrawler-800860ecd25346ff4a638e9d42fa905396b8fa1b.tar.gz sandcrawler-800860ecd25346ff4a638e9d42fa905396b8fa1b.zip |
customize timeout per worker; 120sec for pdf-extract
This is a stab-in-the-dark attempt to resolve long timeouts with this
worker in prod.
Diffstat (limited to 'python/sandcrawler/workers.py')
-rw-r--r-- | python/sandcrawler/workers.py | 3 |
1 files changed, 2 insertions, 1 deletions
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index f6693bb..814cbf3 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -488,6 +488,7 @@ class KafkaJsonPusher(RecordPusher): if self.batch_size in (0, 1): self.batch_size = 1 self.batch_worker = kwargs.get('batch_worker', False) + self.process_timeout_sec = kwargs.get('process_timeout_sec', 300) def run(self): while True: @@ -539,7 +540,7 @@ class KafkaJsonPusher(RecordPusher): while not done: try: # use timeouts; don't want kafka itself to timeout - self.worker.push_record_timeout(record, key=msg.key(), timeout=300) + self.worker.push_record_timeout(record, key=msg.key(), timeout=self.process_timeout_sec) break except SandcrawlerBackoffError as be: print("Backing off for 200 seconds: {}".format(be)) |