From da12c99e0d9cdbdc8868a94f8d78b6cd3b2653fa Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 16 Jul 2022 13:08:05 -0700 Subject: html ingest: allow fuzzy CDX sha1 match based on encoding/not-encoding --- python/sandcrawler/ingest_html.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py index 0ff7fe0..25c6c89 100644 --- a/python/sandcrawler/ingest_html.py +++ b/python/sandcrawler/ingest_html.py @@ -196,10 +196,17 @@ def fetch_html_resources( wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest) if not wayback_resp or wayback_resp.status != "success": raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}") - file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True) - if file_meta["sha1hex"] != wayback_resp.cdx.sha1hex: + # for HTML sub-resources specifically, we allow the CDX SHA1 to match + # either the transfer-encoded or inner (un-encoded) payload body to + # match. This is because of an ambiguity in the WARC specification + outer_file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True) + file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp) + if ( + file_meta["sha1hex"] != wayback_resp.cdx.sha1hex + and outer_file_meta["sha1hex"] != wayback_resp.cdx.sha1hex + ): raise WaybackContentError( - f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}" + f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url} found:{file_meta['sha1hex']} expected:{wayback_resp.cdx.sha1hex}" ) full.append( WebResource( -- cgit v1.2.3