about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-03 21:11:05 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-03 21:11:05 -0800
commit: b182e84b5f3e1ce5732bed657dc0d12fd3257537 (patch)
tree: 10fefa62534e5bff66b91dce0e863fc79d93f81c
parent: 720a45a1d9eea673e0f10d3a7dac0ca85fb913d3 (diff)
download: sandcrawler-b182e84b5f3e1ce5732bed657dc0d12fd3257537.tar.gz
sandcrawler-b182e84b5f3e1ce5732bed657dc0d12fd3257537.zip
ingest: make content-decoding more robust
-rw-r--r-- python/sandcrawler/ingest.py 3
1 files changed, 2 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e2ef47a..7211ee0 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -299,8 +299,9 @@ class IngestFileWorker(SandcrawlerWorker):
print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
try:
inner_body = gzip.decompress(resource.body)
- except EOFError:
+ except Exception as e:
result['status'] = 'bad-gzip-encoding'
+ result['error_message'] = str(e)
return result
if not inner_body:
result['status'] = 'null-body'