about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-10 17:06:41 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-10 17:06:41 -0800
commit bdb4a63ae43c9c292611816a8f74fe7bd00e8a9c (patch)
tree d77de7b17b62bae099d5375abbe41fe2e16453d3
parent a54050c46654b0a6782abe5c0462c6f1c628ef2d (diff)
download sandcrawler-bdb4a63ae43c9c292611816a8f74fe7bd00e8a9c.tar.gz
sandcrawler-bdb4a63ae43c9c292611816a8f74fe7bd00e8a9c.zip
fix redirect replay fetch method
-rw-r--r-- python/sandcrawler/ia.py 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 0ac1320..d91844b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -413,10 +413,13 @@ class WaybackClient:
raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
redirect_url = resp.headers.get("Location")
+ # eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw
+ #print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
redirect_url = "/".join(redirect_url.split("/")[5:])
+ #print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("http"):
- return resp.url
+ return redirect_url
else:
return None