From 19962108e0fd23fe6af24f170da8c47149e531dc Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 8 Nov 2020 11:00:11 -0800 Subject: ingest: fix null-body case Broke this in earlier refactor. --- python/sandcrawler/ia.py | 2 ++ python/sandcrawler/ingest.py | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'python') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 0b58f3b..da667b6 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -1076,6 +1076,8 @@ def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[di if resource.body and file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip': print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr) inner_body = gzip.decompress(resource.body) + if not inner_body: + raise Exception("null body inside transfer encoding") inner_resource = ResourceResult( body=inner_body, # copy all other fields diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 0c8eee6..2f9c523 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -531,6 +531,10 @@ class IngestFileWorker(SandcrawlerWorker): result['status'] = 'blocked-cookie' return result + if not resource.body: + result['status'] = 'null-body' + return result + file_meta = gen_file_metadata(resource.body) try: file_meta, resource = fix_transfer_encoding(file_meta, resource) -- cgit v1.2.3