about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-04-27 17:41:45 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-04-27 17:51:23 -0700
commit f2177d5e30190dfc1e55f1b08fd21c2ce917ee86 (patch)
tree cf326ea7006f423cdc5877bba93a90d57ab3345e
parent 060f86888c8638e3b2be1bb005c29718842ab2e1 (diff)
download sandcrawler-f2177d5e30190dfc1e55f1b08fd21c2ce917ee86.tar.gz
download sandcrawler-f2177d5e30190dfc1e55f1b08fd21c2ce917ee86.zip
timeout message implementation for GROBID and ingest workers
-rw-r--r-- python/sandcrawler/grobid.py 9
-rw-r--r-- python/sandcrawler/ingest.py 9
2 files changed, 18 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 08e3a96..f329a73 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -87,6 +87,15 @@ class GrobidWorker(SandcrawlerWorker):
self.sink = sink
self.consolidate_mode = 2
+ def timeout_response(self, task):
+ default_key = task['sha1hex']
+ return dict(
+ status="error-timeout",
+ error_msg="internal GROBID worker timeout",
+ source=task,
+ key=default_key,
+ )
+
def process(self, record):
default_key = record['sha1hex']
if record.get('warc_path') and record.get('warc_offset'):
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 5cb3ef8..0be7653 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -229,6 +229,15 @@ class IngestFileWorker(SandcrawlerWorker):
result.pop('key', None)
return result
+ def timeout_response(self, task):
+ print("[TIMEOUT]", file=sys.stderr)
+ return dict(
+ request=task,
+ hit=False,
+ status="timeout",
+ error_message="ingest worker internal timeout",
+ )
+
def process(self, request):
# backwards compatibility