about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-05-16 15:02:02 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-05-16 15:02:02 -0700
commit fcc5a1648d2e49e7002ca569ed668d3318a75584 (patch)
tree 6ab5e044625b9d7de9a3596b1c212c259e68669e
parent 1534ff4d05c6fca460e82b5707fe3fbdc3504e50 (diff)
download sandcrawler-fcc5a1648d2e49e7002ca569ed668d3318a75584.tar.gz
sandcrawler-fcc5a1648d2e49e7002ca569ed668d3318a75584.zip
ingest: catch more ConnectionErrors (SPN, replay fetch, GROBID)
-rw-r--r-- python/sandcrawler/grobid.py 9
-rw-r--r-- python/sandcrawler/ia.py 14
2 files changed, 19 insertions, 4 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 1f957da..aa2c112 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -120,6 +120,15 @@ class GrobidClient(object):
"status_code": -4, # heritrix3 "HTTP timeout" code
"error_msg": "GROBID request (HTTP POST) timeout",
}
+ except requests.exceptions.ConnectionError as ce:
+ # intentionally raising this, so workers crash when GROBID
+ # unavailable. but do add a sleep to slow things down.
+ print(
+ "GROBID ConnectionError. sleeping as a slow-down before crashing",
+ file=sys.stderr,
+ )
+ time.sleep(5.0)
+ raise ce
info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
if grobid_response.status_code == 200:
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 7365383..6003f02 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -611,6 +611,8 @@ class WaybackClient:
)
except requests.exceptions.TooManyRedirects:
raise WaybackContentError("redirect loop (wayback replay fetch)")
+ except requests.exceptions.ConnectionError:
+ raise WaybackContentError("ConnectionError (wayback replay fetch)")
except requests.exceptions.ChunkedEncodingError:
raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
except UnicodeDecodeError:
@@ -1042,10 +1044,14 @@ class SavePageNowClient:
req_data["force_get"] = force_simple_get
if capture_outlinks:
req_data["capture_outlinks"] = capture_outlinks
- resp = self.v2_session.post(
- self.v2endpoint,
- data=req_data,
- )
+ try:
+ resp = self.v2_session.post(
+ self.v2endpoint,
+ data=req_data,
+ )
+ except requests.exceptions.ConnectionError:
+ raise SavePageNowError(f"SPN2 TCP connection error {request_url=}")
+
if resp.status_code == 429:
raise SavePageNowBackoffError(
"status_code: {}, url: {}".format(resp.status_code, request_url)