diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-11-01 16:43:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-11-01 16:43:40 -0700 |
commit | dcc0fe1a61c6816e519cfad95ec12d8abe5ddd29 (patch) | |
tree | 70d905f6c8096463155a7dc8ac1e908d82a5f52b | |
parent | a90b604c189bc5655d4a050a9241dfe0b34dbc5b (diff) | |
download | sandcrawler-dcc0fe1a61c6816e519cfad95ec12d8abe5ddd29.tar.gz sandcrawler-dcc0fe1a61c6816e519cfad95ec12d8abe5ddd29.zip |
sandcrawler: try to handle weird CDX API response
Hard to debug this because sentry is broken.
-rw-r--r-- | python/sandcrawler/ia.py | 5 |
1 files changed, 5 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 672a0b6..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -219,6 +219,11 @@ class CdxApiClient:
             if raw[5].startswith("sha-256"):
                 continue
 
+            # remove CDX rows with 'error' digests
+            # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+            if raw[5].lower() == "error":
+                continue
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],