From 4c75d606b385feb29c37d48e0fcf077abf82f92f Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 18 Feb 2020 19:10:01 -0800
Subject: attempt to work around corrupt ARC files from alexa issue

---
 python/sandcrawler/ia.py | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e71f1e8..1d997f4 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -319,6 +319,11 @@ class WaybackClient:
             raise PetaboxError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
         except TypeError as te:
             raise PetaboxError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+        except Exception as e:
+            if "while decompressing data: invalid block type" in str(e):
+                raise PetaboxError("decompression error fetching WARC record; usually due to bad alexa ARC files")
+            else:
+                raise e
         # Note: could consider a generic "except Exception" here, as we get so
         # many petabox errors. Do want jobs to fail loud and clear when the
         # whole cluster is down though.
-- 
cgit v1.2.3