From bdb4a63ae43c9c292611816a8f74fe7bd00e8a9c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 10 Jan 2020 17:06:41 -0800
Subject: fix redirect replay fetch method

---
 python/sandcrawler/ia.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 0ac1320..d91844b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -413,10 +413,13 @@ class WaybackClient:
             raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
 
         redirect_url = resp.headers.get("Location")
+        # eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw
+        #print(redirect_url, file=sys.stderr)
         if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
             redirect_url = "/".join(redirect_url.split("/")[5:])
+        #print(redirect_url, file=sys.stderr)
         if redirect_url and redirect_url.startswith("http"):
-            return resp.url
+            return redirect_url
         else:
             return None
 
-- 
cgit v1.2.3