From fcc5a1648d2e49e7002ca569ed668d3318a75584 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 16 May 2022 15:02:02 -0700 Subject: ingest: catch more ConnectionErrors (SPN, replay fetch, GROBID) --- python/sandcrawler/grobid.py | 9 +++++++++ python/sandcrawler/ia.py | 14 ++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 1f957da..aa2c112 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -120,6 +120,15 @@ class GrobidClient(object): "status_code": -4, # heritrix3 "HTTP timeout" code "error_msg": "GROBID request (HTTP POST) timeout", } + except requests.exceptions.ConnectionError as ce: + # intentionally raising this, so workers crash when GROBID + # unavailable. but do add a sleep to slow things down. + print( + "GROBID ConnectionError. sleeping as a slow-down before crashing", + file=sys.stderr, + ) + time.sleep(5.0) + raise ce info: Dict[str, Any] = dict(status_code=grobid_response.status_code) if grobid_response.status_code == 200: diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 7365383..6003f02 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -611,6 +611,8 @@ class WaybackClient: ) except requests.exceptions.TooManyRedirects: raise WaybackContentError("redirect loop (wayback replay fetch)") + except requests.exceptions.ConnectionError: + raise WaybackContentError("ConnectionError (wayback replay fetch)") except requests.exceptions.ChunkedEncodingError: raise WaybackError("ChunkedEncodingError (wayback replay fetch)") except UnicodeDecodeError: @@ -1042,10 +1044,14 @@ class SavePageNowClient: req_data["force_get"] = force_simple_get if capture_outlinks: req_data["capture_outlinks"] = capture_outlinks - resp = self.v2_session.post( - self.v2endpoint, - data=req_data, - ) + try: + resp = self.v2_session.post( + self.v2endpoint, + data=req_data, + ) + except requests.exceptions.ConnectionError: + raise SavePageNowError(f"SPN2 TCP connection error {request_url=}") + if resp.status_code == 429: raise SavePageNowBackoffError( "status_code: {}, url: {}".format(resp.status_code, request_url) -- cgit v1.2.3