From 97339426c4d0022c3fdf5948ef94b99bb1e120ee Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 13 Jan 2022 15:38:26 -0800 Subject: null-body -> empty-blob --- python/sandcrawler/fileset_strategies.py | 4 ++++ python/sandcrawler/ingest_file.py | 4 ++-- python/sandcrawler/ingest_fileset.py | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index 9696f3c..c3374f8 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -317,6 +317,10 @@ class WebFilesetStrategy(FilesetIngestStrategy): else: assert resource.terminal_status_code == 200 + if not resource.body: + m.status = "empty-blob" + continue + file_meta = gen_file_metadata(resource.body) file_meta, html_resource = fix_transfer_encoding(file_meta, resource) diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index d0c3e0e..857a212 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -683,7 +683,7 @@ class IngestFileWorker(SandcrawlerWorker): return result if not resource.body: - result["status"] = "null-body" + result["status"] = "empty-blob" return result if len(resource.body) > MAX_BODY_SIZE_BYTES: @@ -699,7 +699,7 @@ class IngestFileWorker(SandcrawlerWorker): return result if not resource.body or file_meta["size_bytes"] == 0: - result["status"] = "null-body" + result["status"] = "empty-blob" return result # here we split based on ingest type to try and extract a next hop diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index 542dfbc..7337043 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -180,7 +180,7 @@ class IngestFilesetWorker(IngestFileWorker): return result if not resource.body: - result["status"] = "null-body" + result["status"] = "empty-blob" return result if len(resource.body) > MAX_BODY_SIZE_BYTES: @@ -196,7 +196,7 @@ class IngestFilesetWorker(IngestFileWorker): return result if not resource.body or file_meta["size_bytes"] == 0: - result["status"] = "null-body" + result["status"] = "empty-blob" return result # here we split based on ingest type to try and extract a next hop -- cgit v1.2.3