aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py | 2
-rw-r--r-- python/sandcrawler/ingest.py | 4
2 files changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 0b58f3b..da667b6 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1076,6 +1076,8 @@ def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[di
if resource.body and file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
inner_body = gzip.decompress(resource.body)
+ if not inner_body:
+ raise Exception("null body inside transfer encoding")
inner_resource = ResourceResult(
body=inner_body,
# copy all other fields
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 0c8eee6..2f9c523 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -531,6 +531,10 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = 'blocked-cookie'
return result
+ if not resource.body:
+ result['status'] = 'null-body'
+ return result
+
file_meta = gen_file_metadata(resource.body)
try:
file_meta, resource = fix_transfer_encoding(file_meta, resource)