diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-03 09:57:12 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-03 09:57:12 -0800 |
commit | 46cd3516637fccd388bac6e0357d9ce7e3c7d8f1 (patch) | |
tree | d8c697851c199390a90218f055d3bc004594a878 | |
parent | 9911f2cad6f7470dbdb5af835ad61bcd4b7ad318 (diff) | |
download | sandcrawler-46cd3516637fccd388bac6e0357d9ce7e3c7d8f1.tar.gz sandcrawler-46cd3516637fccd388bac6e0357d9ce7e3c7d8f1.zip |
make gzip content-encoding path more robust
-rw-r--r-- | python/sandcrawler/ingest.py | 11 |
1 file changed, 10 insertions, 1 deletion
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 529e663..e2ef47a 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -293,9 +293,18 @@ class IngestFileWorker(SandcrawlerWorker):
             return result
         file_meta = gen_file_metadata(resource.body)
+        # crude handling of content-encoding; wayback fetch library usually
+        # (and should always?) handle this
         if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
             print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
-            inner_body = gzip.decompress(resource.body)
+            try:
+                inner_body = gzip.decompress(resource.body)
+            except EOFError:
+                result['status'] = 'bad-gzip-encoding'
+                return result
+            if not inner_body:
+                result['status'] = 'null-body'
+                return result
             resource = ResourceResult(
                 body=inner_body,
                 # copy all other fields