diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 4 | ||||
-rw-r--r-- | python/sandcrawler/ingest_file.py | 4 | ||||
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 4 |
3 files changed, 8 insertions, 4 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index 9696f3c..c3374f8 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -317,6 +317,10 @@ class WebFilesetStrategy(FilesetIngestStrategy): else: assert resource.terminal_status_code == 200 + if not resource.body: + m.status = "empty-blob" + continue + file_meta = gen_file_metadata(resource.body) file_meta, html_resource = fix_transfer_encoding(file_meta, resource) diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index d0c3e0e..857a212 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -683,7 +683,7 @@ class IngestFileWorker(SandcrawlerWorker): return result if not resource.body: - result["status"] = "null-body" + result["status"] = "empty-blob" return result if len(resource.body) > MAX_BODY_SIZE_BYTES: @@ -699,7 +699,7 @@ class IngestFileWorker(SandcrawlerWorker): return result if not resource.body or file_meta["size_bytes"] == 0: - result["status"] = "null-body" + result["status"] = "empty-blob" return result # here we split based on ingest type to try and extract a next hop diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index 542dfbc..7337043 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -180,7 +180,7 @@ class IngestFilesetWorker(IngestFileWorker): return result if not resource.body: - result["status"] = "null-body" + result["status"] = "empty-blob" return result if len(resource.body) > MAX_BODY_SIZE_BYTES: @@ -196,7 +196,7 @@ class IngestFilesetWorker(IngestFileWorker): return result if not resource.body or file_meta["size_bytes"] == 0: - result["status"] = "null-body" + result["status"] = "empty-blob" return result # here we split based on ingest type to try and extract a next hop |