aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ia.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-17 16:33:16 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-17 16:36:45 -0700
commitb23469d5b978b3b42a0aa55e7a191280fe1beccd (patch)
tree6b053293df5a60e3000ad975168a52804e66d7d1 /python/sandcrawler/ia.py
parent30ba490bb65d195b14f5b06aea2de5b4eb1d23d2 (diff)
downloadsandcrawler-b23469d5b978b3b42a0aa55e7a191280fe1beccd.tar.gz
sandcrawler-b23469d5b978b3b42a0aa55e7a191280fe1beccd.zip
work around local redirect (resource.location)
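Some redirects are host-local: the redirect location contains only a path, with no scheme or hostname. This patch crudely detects that case (only full-path redirects starting with "/" are handled) and builds the next URL by appending the redirect path to the scheme and host of the original URL.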
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r--python/sandcrawler/ia.py7
1 files changed, 6 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 99e92be..e31ff30 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -659,7 +659,12 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
- next_url = resource.location
+ if resource.location.startswith('/'):
+ # redirect location does not include hostname
+ domain_prefix = '/'.join(next_url.split('/')[:3])
+ next_url = domain_prefix + resource.location
+ else:
+ next_url = resource.location
else:
next_url = self.fetch_replay_redirect(
url=cdx_row.url,