aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-03 09:57:12 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-03 09:57:12 -0800
commit: 46cd3516637fccd388bac6e0357d9ce7e3c7d8f1 (patch)
tree: d8c697851c199390a90218f055d3bc004594a878 /python
parent: 9911f2cad6f7470dbdb5af835ad61bcd4b7ad318 (diff)
download: sandcrawler-46cd3516637fccd388bac6e0357d9ce7e3c7d8f1.tar.gz
download: sandcrawler-46cd3516637fccd388bac6e0357d9ce7e3c7d8f1.zip
make gzip content-encoding path more robust
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 11
1 files changed, 10 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 529e663..e2ef47a 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -293,9 +293,18 @@ class IngestFileWorker(SandcrawlerWorker):
return result
file_meta = gen_file_metadata(resource.body)
+ # crude handling of content-encoding; the wayback fetch library usually
+ # handles this (and should always?)
if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
- inner_body = gzip.decompress(resource.body)
+ try:
+ inner_body = gzip.decompress(resource.body)
+ except EOFError:
+ result['status'] = 'bad-gzip-encoding'
+ return result
+ if not inner_body:
+ result['status'] = 'null-body'
+ return result
resource = ResourceResult(
body=inner_body,
# copy all other fields