diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-01-21 11:23:14 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-21 11:23:14 -0800 | 
| commit | 9affaecc956854621362dfa70cc78b2602a01402 (patch) | |
| tree | a77119e9cf03d82bada23bde5730b0195344514c /python | |
| parent | a1b44161e206873be30c0640f5fab7a284023ba1 (diff) | |
| download | sandcrawler-9affaecc956854621362dfa70cc78b2602a01402.tar.gz sandcrawler-9affaecc956854621362dfa70cc78b2602a01402.zip | |
wayback: replay redirects have X-Archive-Redirect-Reason
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/ia.py | 6 | 
1 files changed, 4 insertions, 2 deletions
| diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 02258df..b5883e5 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -432,7 +432,8 @@ class WaybackClient:          (web.archive.org) instead of petabox.          Intended for use with SPN2 requests, where request body has not ended -        up in petabox yet. +        up in petabox yet. For example, re-ingesting a base_url which was +        recently crawled by SPNv2, where we are doing ingest via wayback path.          Returns None if response is found, but couldn't find redirect.          """ @@ -457,7 +458,8 @@ class WaybackClient:          #print(resp.url, file=sys.stderr)          # defensively check that this is actually correct replay based on headers -        assert "X-Archive-Src" in resp.headers +        if not "X-Archive-Redirect-Reason" in resp.headers: +            raise WaybackError("redirect replay fetch didn't return X-Archive-Redirect-Reason in headers")          if not datetime in resp.url:              raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url)) | 
