diff options
| field | value | date |
|---|---|---|
| author | Bryan Newbold <bnewbold@archive.org> | 2022-01-13 15:38:26 -0800 |
| committer | Bryan Newbold <bnewbold@archive.org> | 2022-01-13 15:38:26 -0800 |
| commit | 97339426c4d0022c3fdf5948ef94b99bb1e120ee (patch) | |
| tree | 53aa78c92f9dba9899ef37a1375433833e4646e5 | |
| parent | ff6894043576d3d51c9ab16623053f91780edc89 (diff) | |
| download | sandcrawler-97339426c4d0022c3fdf5948ef94b99bb1e120ee.tar.gz sandcrawler-97339426c4d0022c3fdf5948ef94b99bb1e120ee.zip | |
null-body -> empty-blob
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 4 |
-rw-r--r-- | python/sandcrawler/ingest_file.py | 4 |
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 4 |
3 files changed, 8 insertions, 4 deletions
```diff
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 9696f3c..c3374f8 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -317,6 +317,10 @@ class WebFilesetStrategy(FilesetIngestStrategy):
             else:
                 assert resource.terminal_status_code == 200
 
+            if not resource.body:
+                m.status = "empty-blob"
+                continue
+
             file_meta = gen_file_metadata(resource.body)
             file_meta, html_resource = fix_transfer_encoding(file_meta, resource)
 
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index d0c3e0e..857a212 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -683,7 +683,7 @@ class IngestFileWorker(SandcrawlerWorker):
             return result
 
         if not resource.body:
-            result["status"] = "null-body"
+            result["status"] = "empty-blob"
             return result
 
         if len(resource.body) > MAX_BODY_SIZE_BYTES:
@@ -699,7 +699,7 @@ class IngestFileWorker(SandcrawlerWorker):
             return result
 
         if not resource.body or file_meta["size_bytes"] == 0:
-            result["status"] = "null-body"
+            result["status"] = "empty-blob"
             return result
 
         # here we split based on ingest type to try and extract a next hop
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 542dfbc..7337043 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -180,7 +180,7 @@ class IngestFilesetWorker(IngestFileWorker):
             return result
 
         if not resource.body:
-            result["status"] = "null-body"
+            result["status"] = "empty-blob"
             return result
 
         if len(resource.body) > MAX_BODY_SIZE_BYTES:
@@ -196,7 +196,7 @@ class IngestFilesetWorker(IngestFileWorker):
             return result
 
         if not resource.body or file_meta["size_bytes"] == 0:
-            result["status"] = "null-body"
+            result["status"] = "empty-blob"
             return result
 
         # here we split based on ingest type to try and extract a next hop
```