diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-17 16:33:16 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-17 16:36:45 -0700 |
commit | b23469d5b978b3b42a0aa55e7a191280fe1beccd (patch) | |
tree | 6b053293df5a60e3000ad975168a52804e66d7d1 /python | |
parent | 30ba490bb65d195b14f5b06aea2de5b4eb1d23d2 (diff) | |
download | sandcrawler-b23469d5b978b3b42a0aa55e7a191280fe1beccd.tar.gz sandcrawler-b23469d5b978b3b42a0aa55e7a191280fe1beccd.zip |
work around local redirect (resource.location)
Some redirects are host-local: the redirect location is a bare path with no
scheme or hostname. This patch crudely detects this case (only full-path
redirects starting with "/"), and prepends the scheme and host of the
original URL to the redirect path.
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 7 |
1 files changed, 6 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 99e92be..e31ff30 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -659,7 +659,12 @@ class WaybackClient: cdx=cdx_row, revisit_cdx=None, ) - next_url = resource.location + if resource.location.startswith('/'): + # redirect location does not include hostname + domain_prefix = '/'.join(next_url.split('/')[:3]) + next_url = domain_prefix + resource.location + else: + next_url = resource.location else: next_url = self.fetch_replay_redirect( url=cdx_row.url, |