diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-29 23:10:40 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-29 23:10:43 -0700 |
commit | f6e6c09df6e1c005a9cdee738c991aa8bbe003b0 (patch) | |
tree | 84d7c97de2b48f92cde10eb29e285d2dc859389f /python | |
parent | afa7aa6006a0de41de6cb08b61aaff61109e2792 (diff) | |
download | sandcrawler-f6e6c09df6e1c005a9cdee738c991aa8bbe003b0.tar.gz sandcrawler-f6e6c09df6e1c005a9cdee738c991aa8bbe003b0.zip |
ia: set User-Agent for replay fetch from wayback
Did this for all the other "client" helpers, but forgot to do so for wayback
replay.
Was starting to get "445" errors from wayback.
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 5 |
1 file changed, 5 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a76f094..d6580e6 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -307,6 +307,9 @@ class WaybackClient:
         self.rstore = None
         self.max_redirects = 25
         self.wayback_endpoint = "https://web.archive.org/web/"
+        self.replay_headers = {
+            'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient',
+        }
 
     def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True):
         """
@@ -464,6 +467,7 @@ class WaybackClient:
             resp = requests.get(
                 self.wayback_endpoint + datetime + "id_/" + url,
                 allow_redirects=False,
+                headers=self.replay_headers,
             )
         except requests.exceptions.TooManyRedirects:
             raise WaybackError("redirect loop (wayback replay fetch)")
@@ -519,6 +523,7 @@ class WaybackClient:
             resp = requests.get(
                 self.wayback_endpoint + datetime + "id_/" + url,
                 allow_redirects=False,
+                headers=self.replay_headers,
             )
         except requests.exceptions.TooManyRedirects:
             raise WaybackError("redirect loop (wayback replay fetch)")