aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-18 13:44:01 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-18 13:44:01 -0700
commit 8e4a39cfce3d9ba1bec98855831be2cebdd951be (patch)
tree cc81226062de01cc7b131e773a7b5772a0e109bf
parent 9249f8535732923f2f147c68bb8523d52080f6e8 (diff)
download sandcrawler-8e4a39cfce3d9ba1bec98855831be2cebdd951be.tar.gz
sandcrawler-8e4a39cfce3d9ba1bec98855831be2cebdd951be.zip
ingest: record bad GZIP transfer decode, instead of crashing (HTML)
-rw-r--r-- python/sandcrawler/ingest_html.py 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 25c6c89..1c2c3fd 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -200,7 +200,10 @@ def fetch_html_resources(
# either the transfer-encoded or inner (un-encoded) payload body to
# match. This is because of an ambiguity in the WARC specification
outer_file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
- file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp)
+ try:
+ file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp)
+ except Exception as e:
+ raise WaybackContentError(f"bad gzip encoding: {e}")
if (
file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
and outer_file_meta["sha1hex"] != wayback_resp.cdx.sha1hex