aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-10-22 23:08:15 -0700
committerBryan Newbold <bnewbold@archive.org>2020-10-22 23:08:15 -0700
commit031f51752e79dbdde47bbc95fe6b3600c9ec711a (patch)
treed391be5b2796f9e0d2c8d843d79a58796d58c7ac /python
parent4a9fba8005e0a65c03198c674d2c65f7440d71a6 (diff)
downloadsandcrawler-031f51752e79dbdde47bbc95fe6b3600c9ec711a.tar.gz
sandcrawler-031f51752e79dbdde47bbc95fe6b3600c9ec711a.zip
reimplement worker timeout with multiprocessing
Diffstat (limited to 'python')
-rw-r--r--python/sandcrawler/workers.py40
1 files changed, 17 insertions, 23 deletions
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 37e3d7a..b6cec29 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -2,7 +2,6 @@
import sys
import json
import time
-import signal
import zipfile
import requests
import multiprocessing.pool
@@ -57,30 +56,25 @@ class SandcrawlerWorker(object):
"""
A wrapper around self.push_record which sets a timeout.
- Note that this uses signals and *will behave wrong/weirdly* with
- multithreading or if signal-based timeouts are used elsewhere in the
- same process.
+ Current implementation uses multiprocessing instead of signal.
"""
- def timeout_handler(signum, frame):
- raise TimeoutError("timeout processing record")
- signal.signal(signal.SIGALRM, timeout_handler)
- resp = None
- signal.alarm(int(timeout))
- try:
- resp = self.push_record(task, key=key)
- except TimeoutError:
- self.counts['timeout'] += 1
- resp = self.timeout_response(task) # pylint: disable=assignment-from-none
- # TODO: what if it is this push_record() itself that is timing out?
- if resp and self.sink:
- self.sink.push_record(resp)
- self.counts['pushed'] += 1
- elif resp:
- print(json.dumps(resp))
- finally:
- signal.alarm(0)
- return resp
+ with multiprocessing.pool.Pool(1) as p:
+ proc = p.apply_async(self._push_record_helper, (task, key))
+ try:
+ resp = proc.get(timeout=timeout)
+ except TimeoutError:
+ self.counts['timeout'] += 1
+ resp = self.timeout_response(task) # pylint: disable=assignment-from-none
+ # TODO: what if it is this push_record() itself that is timing out?
+ if resp and self.sink:
+ self.sink.push_record(resp)
+ self.counts['pushed'] += 1
+ elif resp:
+ print(json.dumps(resp))
+ raise TimeoutError("timeout processing record")
+ else:
+ return resp
def push_batch(self, tasks):
results = []