aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-01-15 21:25:57 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-15 21:25:59 -0800
commite83850ec7bd113e6cfb4af97df37934cb23ef265 (patch)
tree383ad984d750d1f128e5242bfd06597a0cccee7f
parentf4862bd582577749c7d71979e3e56650a4a58200 (diff)
downloadsandcrawler-e83850ec7bd113e6cfb4af97df37934cb23ef265.tar.gz
sandcrawler-e83850ec7bd113e6cfb4af97df37934cb23ef265.zip
wayback replay: catch UnicodeDecodeError
In prod, ran into a redirect URL like: b'/web/20200116043630id_/https://mediarep.org/bitstream/handle/doc/1127/Barth\xe9l\xe9my_2015_Life_and_Technology.pdf;jsessionid=A9EFB2798846F5E14A8473BBFD6AB46C?sequence=1', which broke requests.
-rw-r--r--python/sandcrawler/ia.py2
1 files changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 29991df..058f4ca 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -397,6 +397,8 @@ class WaybackClient:
)
except requests.exceptions.TooManyRedirects:
raise WaybackError("redirect loop (wayback replay fetch)")
+ except UnicodeDecodeError:
+ raise WaybackError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
try:
resp.raise_for_status()
except Exception as e: