about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-11-13 21:04:24 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-11-13 21:04:24 -0800
commit 53b2d766c39ab832ef65295a1c60f2324b30a53c (patch)
tree 06a7eaa41ed0d4a84cbe77d916a16af4f4a22317
parent 5e169ce051884d2ebbdcbfde7cdc1d2b2efc4f74 (diff)
download sandcrawler-53b2d766c39ab832ef65295a1c60f2324b30a53c.tar.gz
sandcrawler-53b2d766c39ab832ef65295a1c60f2324b30a53c.zip
handle SPNv1 remote server HTTP status codes better
-rw-r--r-- python/sandcrawler/ia.py 23
1 files changed, 15 insertions, 8 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 39227ca..d1e376a 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -152,8 +152,8 @@ class SavePageNowClient:
self.ia_secret_key = os.environ.get('IA_SECRET_KEY')
self.v1endpoint = v1endpoint
self.v2endpoint = v2endpoint
- self.http_session = requests_retry_session(retries=5, backoff_factor=3)
- self.http_session.headers.update({
+ self.v1_session = requests_retry_session(retries=5, backoff_factor=3)
+ self.v1_session.headers.update({
'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient',
})
self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
@@ -168,14 +168,21 @@ class SavePageNowClient:
Returns a tuple (cdx, blob) on success of single fetch, or raises an
error on non-success.
"""
- resp = self.http_session.get(self.v1endpoint + url)
- if resp.status_code != 200 and not resp.headers.get('X-Archive-Orig-Location'):
- # looks like an error which was *not* a remote server error. Some
- # problem with wayback, might need to short-circuit
- raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
- if resp.headers.get('X-Archive-Wayback-Runtime-Error'):
+ try:
+ resp = self.v1_session.get(self.v1endpoint + url, status_forcelist=())
+ except requests.exceptions.RetryError as re:
+ # could have been any number of issues...
+ raise SavePageNowError(str(re))
+
+ if resp.status_code != 200 and resp.headers.get('X-Archive-Wayback-Runtime-Error'):
# looks like a weird remote error; would not expect a CDX reply so bailing here
raise SavePageNowRemoteError(resp.headers['X-Archive-Wayback-Runtime-Error'])
+ if resp.status_code != 200 and not resp.headers.get('X-Archive-Orig-Location'):
+ # looks like an error which was *not* just a remote server HTTP
+ # status code, or one of the handled wayback runtime errors. Some
+ # of these are remote server errors that wayback doesn't detect?
+ raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
+
terminal_url = '/'.join(resp.url.split('/')[5:])
body = resp.content
cdx = self.cdx_client.lookup_latest(terminal_url)