diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-03 21:11:05 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-03 21:11:05 -0800 |
commit | b182e84b5f3e1ce5732bed657dc0d12fd3257537 (patch) | |
tree | 10fefa62534e5bff66b91dce0e863fc79d93f81c | |
parent | 720a45a1d9eea673e0f10d3a7dac0ca85fb913d3 (diff) | |
download | sandcrawler-b182e84b5f3e1ce5732bed657dc0d12fd3257537.tar.gz sandcrawler-b182e84b5f3e1ce5732bed657dc0d12fd3257537.zip |
ingest: make content-decoding more robust
-rw-r--r-- | python/sandcrawler/ingest.py | 3 |
1 files changed, 2 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e2ef47a..7211ee0 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -299,8 +299,9 @@ class IngestFileWorker(SandcrawlerWorker):
             print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
             try:
                 inner_body = gzip.decompress(resource.body)
-            except EOFError:
+            except Exception as e:
                 result['status'] = 'bad-gzip-encoding'
+                result['error_message'] = str(e)
                 return result
             if not inner_body:
                 result['status'] = 'null-body'