diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-18 19:10:01 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-18 19:10:07 -0800 |
commit | 4c75d606b385feb29c37d48e0fcf077abf82f92f (patch) | |
tree | 2a3a20a6309dc472e598a36ba30df8d7552c1800 /python | |
parent | 3f8ead3d07bed78c750b9f6a8b7e95ebffeff089 (diff) | |
download | sandcrawler-4c75d606b385feb29c37d48e0fcf077abf82f92f.tar.gz sandcrawler-4c75d606b385feb29c37d48e0fcf077abf82f92f.zip |
attempt to work around corrupt ARC files from alexa issue
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 5 |
1 files changed, 5 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e71f1e8..1d997f4 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -319,6 +319,11 @@ class WaybackClient:
             raise PetaboxError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
         except TypeError as te:
             raise PetaboxError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+        except Exception as e:
+            if "while decompressing data: invalid block type" in str(e):
+                raise PetaboxError("decompression error fetching WARC record; usually due to bad alexa ARC files")
+            else:
+                raise e
         # Note: could consider a generic "except Exception" here, as we get so
         # many petabox errors. Do want jobs to fail loud and clear when the
         # whole cluster is down though.
```