diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-13 12:34:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-13 12:34:41 -0700 |
commit | 5cc02ee97b39d72c88dd2e0755abf6f0d3a4b050 (patch) | |
tree | 01e6f53fef6ba82e6b99d1c241ba4b0a1e440adc | |
parent | b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427 (diff) | |
download | sandcrawler-5cc02ee97b39d72c88dd2e0755abf6f0d3a4b050.tar.gz sandcrawler-5cc02ee97b39d72c88dd2e0755abf6f0d3a4b050.zip |
crawl: SPN self-redirect hack
-rw-r--r-- | python/sandcrawler/ia.py | 9 |
1 files changed, 9 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 9b2635b..166cf20 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1025,6 +1025,15 @@ class SavePageNowClient:
                     filter_status_code=filter_status_code,
                     retry_sleep=9.0,
                 )
+                # sometimes there are fuzzy http/https self-redirects with the
+                # same SURT; try to work around that
+                if cdx_row.status_code >= 300 and cdx_row.status_code < 400:
+                    cdx_row = wayback_client.cdx_client.fetch(
+                        url=spn_result.terminal_url,
+                        datetime=spn_result.terminal_dt,
+                        filter_status_code=200,
+                        retry_sleep=9.0,
+                    )
             except KeyError as ke:
                 print(" CDX KeyError: {}".format(ke), file=sys.stderr)
                 return ResourceResult(