diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-04-27 17:41:45 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-27 17:51:23 -0700 | 
| commit | f2177d5e30190dfc1e55f1b08fd21c2ce917ee86 (patch) | |
| tree | cf326ea7006f423cdc5877bba93a90d57ab3345e | |
| parent | 060f86888c8638e3b2be1bb005c29718842ab2e1 (diff) | |
| download | sandcrawler-f2177d5e30190dfc1e55f1b08fd21c2ce917ee86.tar.gz sandcrawler-f2177d5e30190dfc1e55f1b08fd21c2ce917ee86.zip  | |
timeout message implementation for GROBID and ingest workers
| -rw-r--r-- | python/sandcrawler/grobid.py | 9 | ||||
| -rw-r--r-- | python/sandcrawler/ingest.py | 9 | 
2 files changed, 18 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 08e3a96..f329a73 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -87,6 +87,15 @@ class GrobidWorker(SandcrawlerWorker):
         self.sink = sink
         self.consolidate_mode = 2
 
+    def timeout_response(self, task):
+        default_key = task['sha1hex']
+        return dict(
+            status="error-timeout",
+            error_msg="internal GROBID worker timeout",
+            source=task,
+            key=default_key,
+        )
+
     def process(self, record):
         default_key = record['sha1hex']
         if record.get('warc_path') and record.get('warc_offset'):
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 5cb3ef8..0be7653 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -229,6 +229,15 @@ class IngestFileWorker(SandcrawlerWorker):
         result.pop('key', None)
         return result
 
+    def timeout_response(self, task):
+        print("[TIMEOUT]", file=sys.stderr)
+        return dict(
+            request=task,
+            hit=False,
+            status="timeout",
+            error_message="ingest worker internal timeout",
+        )
+
     def process(self, request):
 
         # backwards compatibility
```
