diff options
-rw-r--r-- | python/sandcrawler/ingest.py | 11 |
1 file changed, 10 insertions, 1 deletion
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 529e663..e2ef47a 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -293,9 +293,18 @@ class IngestFileWorker(SandcrawlerWorker):
             return result
 
         file_meta = gen_file_metadata(resource.body)
+        # crude handling of content-encoding; wayback fetch library usually
+        # (and should always?) handle this
         if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
             print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
-            inner_body = gzip.decompress(resource.body)
+            try:
+                inner_body = gzip.decompress(resource.body)
+            except EOFError:
+                result['status'] = 'bad-gzip-encoding'
+                return result
+            if not inner_body:
+                result['status'] = 'null-body'
+                return result
             resource = ResourceResult(
                 body=inner_body,
                 # copy all other fields