aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-18 19:10:01 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-18 19:10:07 -0800
commit 4c75d606b385feb29c37d48e0fcf077abf82f92f (patch)
tree 2a3a20a6309dc472e598a36ba30df8d7552c1800 /python
parent 3f8ead3d07bed78c750b9f6a8b7e95ebffeff089 (diff)
download sandcrawler-4c75d606b385feb29c37d48e0fcf077abf82f92f.tar.gz
sandcrawler-4c75d606b385feb29c37d48e0fcf077abf82f92f.zip
attempt to work around corrupt ARC files from alexa issue
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py 5
1 files changed, 5 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e71f1e8..1d997f4 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -319,6 +319,11 @@ class WaybackClient:
raise PetaboxError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
except TypeError as te:
raise PetaboxError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ except Exception as e:
+ if "while decompressing data: invalid block type" in str(e):
+ raise PetaboxError("decompression error fetching WARC record; usually due to bad alexa ARC files")
+ else:
+ raise e
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.