aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/ia.py 5
1 files changed, 5 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e71f1e8..1d997f4 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -319,6 +319,11 @@ class WaybackClient:
raise PetaboxError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
except TypeError as te:
raise PetaboxError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ except Exception as e:
+ if "while decompressing data: invalid block type" in str(e):
+ raise PetaboxError("decompression error fetching WARC record; usually due to bad alexa ARC files")
+ else:
+ raise e
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.