From f6e6c09df6e1c005a9cdee738c991aa8bbe003b0 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 29 Mar 2020 23:10:40 -0700 Subject: ia: set User-Agent for replay fetch from wayback Did this for all the other "client" helpers, but forgot to do so for wayback replay. We were starting to get "445" errors from wayback. --- python/sandcrawler/ia.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'python') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index a76f094..d6580e6 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -307,6 +307,9 @@ class WaybackClient: self.rstore = None self.max_redirects = 25 self.wayback_endpoint = "https://web.archive.org/web/" + self.replay_headers = { + 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient', + } def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True): """ @@ -464,6 +467,7 @@ class WaybackClient: resp = requests.get( self.wayback_endpoint + datetime + "id_/" + url, allow_redirects=False, + headers=self.replay_headers, ) except requests.exceptions.TooManyRedirects: raise WaybackError("redirect loop (wayback replay fetch)") @@ -519,6 +523,7 @@ class WaybackClient: resp = requests.get( self.wayback_endpoint + datetime + "id_/" + url, allow_redirects=False, + headers=self.replay_headers, ) except requests.exceptions.TooManyRedirects: raise WaybackError("redirect loop (wayback replay fetch)") -- cgit v1.2.3