path: root/python/sandcrawler/ia.py
author    Bryan Newbold <bnewbold@archive.org>  2020-10-19 17:00:29 -0700
committer Bryan Newbold <bnewbold@archive.org>  2020-10-19 17:00:29 -0700
commit    41b3ae7f7f1d90a8e2aa141448cddd7b174e92fb (patch)
tree      2660fb4395979c257d65c8750f098a7087c32ffc /python/sandcrawler/ia.py
parent    fbbc76c8a7e523c029ff6f881b7ab4220131fd6c (diff)
CDX: when retrying, do so every 3 seconds up to limit
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r--   python/sandcrawler/ia.py   14
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 2d0d068..b4a8812 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -211,16 +211,20 @@ class CdxApiClient:
             params['filter'] = "statuscode:{}".format(filter_status_code)
         resp = self._query_api(params)
         if not resp:
-            if retry_sleep:
-                print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+            if retry_sleep and retry_sleep > 0:
+                next_sleep = None
+                if retry_sleep > 3:
+                    next_sleep = retry_sleep - 3
+                    retry_sleep = 3
+                print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
                 time.sleep(retry_sleep)
-                return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
+                return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep)
             raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
         row = resp[0]
         # allow fuzzy http/https match
         if not (fuzzy_match_url(row.url, url) and row.datetime == datetime):
-            if retry_sleep:
-                print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+            if retry_sleep and retry_sleep > 0:
+                print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
                 time.sleep(retry_sleep)
                 return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
             raise KeyError("Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(url, datetime, row))