aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-11-01 16:43:38 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-11-01 16:43:40 -0700
commit: dcc0fe1a61c6816e519cfad95ec12d8abe5ddd29 (patch)
tree: 70d905f6c8096463155a7dc8ac1e908d82a5f52b /python
parent: a90b604c189bc5655d4a050a9241dfe0b34dbc5b (diff)
download: sandcrawler-dcc0fe1a61c6816e519cfad95ec12d8abe5ddd29.tar.gz
sandcrawler-dcc0fe1a61c6816e519cfad95ec12d8abe5ddd29.zip
sandcrawler: try to handle weird CDX API response
Hard to debug this because sentry is broken.
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py 5
1 file changed, 5 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 672a0b6..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -219,6 +219,11 @@ class CdxApiClient:
if raw[5].startswith("sha-256"):
continue
+ # remove CDX rows with 'error' digests
+ # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+ if raw[5].lower() == "error":
+ continue
+
row = CdxRow(
surt=raw[0],
datetime=raw[1],