From 8e4a39cfce3d9ba1bec98855831be2cebdd951be Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 18 Jul 2022 13:44:01 -0700 Subject: ingest: record bad GZIP transfer decode, instead of crashing (HTML) --- python/sandcrawler/ingest_html.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py index 25c6c89..1c2c3fd 100644 --- a/python/sandcrawler/ingest_html.py +++ b/python/sandcrawler/ingest_html.py @@ -200,7 +200,10 @@ def fetch_html_resources( # either the transfer-encoded or inner (un-encoded) payload body to # match. This is because of an ambiguity in the WARC specification outer_file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True) - file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp) + try: + file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp) + except Exception as e: + raise WaybackContentError(f"bad gzip encoding: {e}") if ( file_meta["sha1hex"] != wayback_resp.cdx.sha1hex and outer_file_meta["sha1hex"] != wayback_resp.cdx.sha1hex -- cgit v1.2.3