aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-07-13 12:34:41 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-07-13 12:34:41 -0700
commit 5cc02ee97b39d72c88dd2e0755abf6f0d3a4b050 (patch)
tree 01e6f53fef6ba82e6b99d1c241ba4b0a1e440adc
parent b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427 (diff)
download sandcrawler-5cc02ee97b39d72c88dd2e0755abf6f0d3a4b050.tar.gz
sandcrawler-5cc02ee97b39d72c88dd2e0755abf6f0d3a4b050.zip
crawl: SPN self-redirect hack
-rw-r--r-- python/sandcrawler/ia.py 9
1 file changed, 9 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 9b2635b..166cf20 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1025,6 +1025,15 @@ class SavePageNowClient:
filter_status_code=filter_status_code,
retry_sleep=9.0,
)
+ # sometimes there are fuzzy http/https self-redirects with the
+ # same SURT; try to work around that
+ if cdx_row.status_code >= 300 and cdx_row.status_code < 400:
+ cdx_row = wayback_client.cdx_client.fetch(
+ url=spn_result.terminal_url,
+ datetime=spn_result.terminal_dt,
+ filter_status_code=200,
+ retry_sleep=9.0,
+ )
except KeyError as ke:
print(" CDX KeyError: {}".format(ke), file=sys.stderr)
return ResourceResult(