about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-01-13 15:38:26 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2022-01-13 15:38:26 -0800
commit: 97339426c4d0022c3fdf5948ef94b99bb1e120ee (patch)
tree: 53aa78c92f9dba9899ef37a1375433833e4646e5 /python/sandcrawler
parent: ff6894043576d3d51c9ab16623053f91780edc89 (diff)
download: sandcrawler-97339426c4d0022c3fdf5948ef94b99bb1e120ee.tar.gz
download: sandcrawler-97339426c4d0022c3fdf5948ef94b99bb1e120ee.zip
null-body -> empty-blob
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/fileset_strategies.py 4
-rw-r--r-- python/sandcrawler/ingest_file.py 4
-rw-r--r-- python/sandcrawler/ingest_fileset.py 4
3 files changed, 8 insertions, 4 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 9696f3c..c3374f8 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -317,6 +317,10 @@ class WebFilesetStrategy(FilesetIngestStrategy):
else:
assert resource.terminal_status_code == 200
+ if not resource.body:
+ m.status = "empty-blob"
+ continue
+
file_meta = gen_file_metadata(resource.body)
file_meta, html_resource = fix_transfer_encoding(file_meta, resource)
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index d0c3e0e..857a212 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -683,7 +683,7 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if not resource.body:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
if len(resource.body) > MAX_BODY_SIZE_BYTES:
@@ -699,7 +699,7 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if not resource.body or file_meta["size_bytes"] == 0:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
# here we split based on ingest type to try and extract a next hop
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 542dfbc..7337043 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -180,7 +180,7 @@ class IngestFilesetWorker(IngestFileWorker):
return result
if not resource.body:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
if len(resource.body) > MAX_BODY_SIZE_BYTES:
@@ -196,7 +196,7 @@ class IngestFilesetWorker(IngestFileWorker):
return result
if not resource.body or file_meta["size_bytes"] == 0:
- result["status"] = "null-body"
+ result["status"] = "empty-blob"
return result
# here we split based on ingest type to try and extract a next hop