From 46cd3516637fccd388bac6e0357d9ce7e3c7d8f1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Mar 2020 09:57:12 -0800 Subject: make gzip content-encoding path more robust --- python/sandcrawler/ingest.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 529e663..e2ef47a 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -293,9 +293,18 @@ class IngestFileWorker(SandcrawlerWorker): return result file_meta = gen_file_metadata(resource.body) + # crude handling of content-encoding; wayback fetch library usually + # (and should always?) handle this if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip': print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr) - inner_body = gzip.decompress(resource.body) + try: + inner_body = gzip.decompress(resource.body) + except EOFError: + result['status'] = 'bad-gzip-encoding' + return result + if not inner_body: + result['status'] = 'null-body' + return result resource = ResourceResult( body=inner_body, # copy all other fields -- cgit v1.2.3