diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-05-16 15:02:02 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-05-16 15:02:02 -0700 |
commit | fcc5a1648d2e49e7002ca569ed668d3318a75584 (patch) | |
tree | 6ab5e044625b9d7de9a3596b1c212c259e68669e /python/sandcrawler/ia.py | |
parent | 1534ff4d05c6fca460e82b5707fe3fbdc3504e50 (diff) | |
download | sandcrawler-fcc5a1648d2e49e7002ca569ed668d3318a75584.tar.gz sandcrawler-fcc5a1648d2e49e7002ca569ed668d3318a75584.zip |
ingest: catch more ConnectionErrors (SPN, replay fetch, GROBID)
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- | python/sandcrawler/ia.py | 14 |
1 file changed, 10 insertions, 4 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 7365383..6003f02 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -611,6 +611,8 @@ class WaybackClient:
             )
         except requests.exceptions.TooManyRedirects:
             raise WaybackContentError("redirect loop (wayback replay fetch)")
+        except requests.exceptions.ConnectionError:
+            raise WaybackContentError("ConnectionError (wayback replay fetch)")
         except requests.exceptions.ChunkedEncodingError:
             raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
         except UnicodeDecodeError:
@@ -1042,10 +1044,14 @@ class SavePageNowClient:
             req_data["force_get"] = force_simple_get
         if capture_outlinks:
             req_data["capture_outlinks"] = capture_outlinks
-        resp = self.v2_session.post(
-            self.v2endpoint,
-            data=req_data,
-        )
+        try:
+            resp = self.v2_session.post(
+                self.v2endpoint,
+                data=req_data,
+            )
+        except requests.exceptions.ConnectionError:
+            raise SavePageNowError(f"SPN2 TCP connection error {request_url=}")
+
         if resp.status_code == 429:
             raise SavePageNowBackoffError(
                 "status_code: {}, url: {}".format(resp.status_code, request_url)
```