aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-03-29 23:10:40 -0700
committerBryan Newbold <bnewbold@archive.org>2020-03-29 23:10:43 -0700
commitf6e6c09df6e1c005a9cdee738c991aa8bbe003b0 (patch)
tree84d7c97de2b48f92cde10eb29e285d2dc859389f
parentafa7aa6006a0de41de6cb08b61aaff61109e2792 (diff)
downloadsandcrawler-f6e6c09df6e1c005a9cdee738c991aa8bbe003b0.tar.gz
sandcrawler-f6e6c09df6e1c005a9cdee738c991aa8bbe003b0.zip
ia: set User-Agent for replay fetch from wayback
Did this for all the other "client" helpers, but forgot to do so for wayback replay. We were starting to get "445" errors from wayback.
-rw-r--r--python/sandcrawler/ia.py5
1 files changed, 5 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a76f094..d6580e6 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -307,6 +307,9 @@ class WaybackClient:
self.rstore = None
self.max_redirects = 25
self.wayback_endpoint = "https://web.archive.org/web/"
+ self.replay_headers = {
+ 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient',
+ }
def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True):
"""
@@ -464,6 +467,7 @@ class WaybackClient:
resp = requests.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
+ headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
raise WaybackError("redirect loop (wayback replay fetch)")
@@ -519,6 +523,7 @@ class WaybackClient:
resp = requests.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
+ headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
raise WaybackError("redirect loop (wayback replay fetch)")