From e83850ec7bd113e6cfb4af97df37934cb23ef265 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 15 Jan 2020 21:25:57 -0800 Subject: wayback replay: catch UnicodeDecodeError In prod, ran into a redirect URL like: b'/web/20200116043630id_/https://mediarep.org/bitstream/handle/doc/1127/Barth\xe9l\xe9my_2015_Life_and_Technology.pdf;jsessionid=A9EFB2798846F5E14A8473BBFD6AB46C?sequence=1' which broke requests. --- python/sandcrawler/ia.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'python') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 29991df..058f4ca 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -397,6 +397,8 @@ class WaybackClient: ) except requests.exceptions.TooManyRedirects: raise WaybackError("redirect loop (wayback replay fetch)") + except UnicodeDecodeError: + raise WaybackError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url)) try: resp.raise_for_status() except Exception as e: -- cgit v1.2.3