From dcc0fe1a61c6816e519cfad95ec12d8abe5ddd29 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 1 Nov 2022 16:43:38 -0700
Subject: sandcrawler: try to handle weird CDX API response

Hard to debug this because sentry is broken.
---
 python/sandcrawler/ia.py | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 672a0b6..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -219,6 +219,11 @@ class CdxApiClient:
             if raw[5].startswith("sha-256"):
                 continue
 
+            # remove CDX rows with 'error' digests
+            # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+            if raw[5].lower() == "error":
+                continue
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],
-- 
cgit v1.2.3