diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 17:00:29 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 17:00:29 -0700 |
commit | 41b3ae7f7f1d90a8e2aa141448cddd7b174e92fb (patch) | |
tree | 2660fb4395979c257d65c8750f098a7087c32ffc /python | |
parent | fbbc76c8a7e523c029ff6f881b7ab4220131fd6c (diff) | |
download | sandcrawler-41b3ae7f7f1d90a8e2aa141448cddd7b174e92fb.tar.gz sandcrawler-41b3ae7f7f1d90a8e2aa141448cddd7b174e92fb.zip |
CDX: when retrying, do so every 3 seconds up to limit
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 14 |
1 file changed, 9 insertions, 5 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 2d0d068..b4a8812 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -211,16 +211,20 @@ class CdxApiClient:
             params['filter'] = "statuscode:{}".format(filter_status_code)
         resp = self._query_api(params)
         if not resp:
-            if retry_sleep:
-                print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+            if retry_sleep and retry_sleep > 0:
+                next_sleep = None
+                if retry_sleep > 3:
+                    next_sleep = retry_sleep - 3
+                    retry_sleep = 3
+                print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
                 time.sleep(retry_sleep)
-                return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
+                return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep)
             raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
         row = resp[0]
         # allow fuzzy http/https match
         if not (fuzzy_match_url(row.url, url) and row.datetime == datetime):
-            if retry_sleep:
-                print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+            if retry_sleep and retry_sleep > 0:
+                print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
                 time.sleep(retry_sleep)
                 return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
             raise KeyError("Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(url, datetime, row))
```