about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-29 23:10:40 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-29 23:10:43 -0700
commit: f6e6c09df6e1c005a9cdee738c991aa8bbe003b0 (patch)
tree: 84d7c97de2b48f92cde10eb29e285d2dc859389f /python
parent: afa7aa6006a0de41de6cb08b61aaff61109e2792 (diff)
download: sandcrawler-f6e6c09df6e1c005a9cdee738c991aa8bbe003b0.tar.gz
download: sandcrawler-f6e6c09df6e1c005a9cdee738c991aa8bbe003b0.zip
ia: set User-Agent for replay fetch from wayback
A User-Agent was already being set for all the other "client" helpers, but this was missed for wayback replay. Replay fetches had started to get "445" errors from wayback.
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py 5
1 files changed, 5 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a76f094..d6580e6 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -307,6 +307,9 @@ class WaybackClient:
self.rstore = None
self.max_redirects = 25
self.wayback_endpoint = "https://web.archive.org/web/"
+ self.replay_headers = {
+ 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient',
+ }
def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True):
"""
@@ -464,6 +467,7 @@ class WaybackClient:
resp = requests.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
+ headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
raise WaybackError("redirect loop (wayback replay fetch)")
@@ -519,6 +523,7 @@ class WaybackClient:
resp = requests.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
+ headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
raise WaybackError("redirect loop (wayback replay fetch)")