diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-11-14 11:03:20 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-14 11:03:20 -0800 |
commit | 7d754db1b5809d0cb4c18c4432b4347914757196 (patch) | |
tree | d3434d687f5c429657a3e77a879a5cf80e8225ec /python | |
parent | 8774b17dbb9c0be8ca44846188f77403fae3e867 (diff) | |
download | sandcrawler-7d754db1b5809d0cb4c18c4432b4347914757196.tar.gz sandcrawler-7d754db1b5809d0cb4c18c4432b4347914757196.zip |
handle SPNv2 polling timeout
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 16 |
1 files changed, 10 insertions, 6 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 31ea84e..08b92e2 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -192,7 +192,7 @@ class SavePageNowClient:
 
     def save_url_now_v2(self, url):
         """
-        Returns a list of cdx objects, or raises an error on non-success.
+        Returns a list of URLs, or raises an error on non-success.
         """
         if not (self.ia_access_key and self.ia_secret_key):
             raise Exception("SPNv2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)")
@@ -210,20 +210,24 @@ class SavePageNowClient:
         assert resp_json
 
         # poll until complete
+        final_json = None
         for i in range(90):
             resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, resp_json['job_id']))
             resp.raise_for_status()
             status = resp.json()['status']
             if status == 'success':
-                resp = resp.json()
-                if resp.get('message', '').startswith('The same snapshot had been made'):
-                    raise SavePageNowError("SPN2 re-snapshot withing short time window")
+                final_json = resp.json()
+                if final_json.get('message', '').startswith('The same snapshot had been made'):
+                    raise SavePageNowError("SPN2 re-snapshot within short time window")
                 break
             elif status == 'pending':
                 time.sleep(1.0)
             else:
                 raise SavePageNowError("SPN2 status:{} url:{}".format(status, url))
 
-        #print(resp)
-        return resp['resources']
+        if not final_json:
+            raise SavePageNowError("SPN2 timed out (polling count exceeded)")
+
+        #print(final_json)
+        return final_json['resources']
 
```