From b23469d5b978b3b42a0aa55e7a191280fe1beccd Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 17 Mar 2020 16:33:16 -0700 Subject: work around local redirect (resource.location) Some redirects are host-local. This patch crudely detects this (full-path redirects starting with "/" only), and appends the URL to the host of the original URL. --- python/sandcrawler/ia.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'python') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 99e92be..e31ff30 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -659,7 +659,12 @@ class WaybackClient: cdx=cdx_row, revisit_cdx=None, ) - next_url = resource.location + if resource.location.startswith('/'): + # redirect location does not include hostname + domain_prefix = '/'.join(next_url.split('/')[:3]) + next_url = domain_prefix + resource.location + else: + next_url = resource.location else: next_url = self.fetch_replay_redirect( url=cdx_row.url, -- cgit v1.2.3