diff options
-rw-r--r-- | python/sandcrawler/ia.py | 7 |
1 file changed, 6 insertions, 1 deletion
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 99e92be..e31ff30 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -659,7 +659,12 @@ class WaybackClient:
                             cdx=cdx_row,
                             revisit_cdx=None,
                         )
-                    next_url = resource.location
+                    if resource.location.startswith('/'):
+                        # redirect location does not include hostname
+                        domain_prefix = '/'.join(next_url.split('/')[:3])
+                        next_url = domain_prefix + resource.location
+                    else:
+                        next_url = resource.location
                 else:
                     next_url = self.fetch_replay_redirect(
                         url=cdx_row.url,