From 20bff1b2fc5e16677dee3deff3410cda91baa3e0 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 5 Jan 2021 17:49:04 -0800 Subject: ia CDX: handle bad CDX rows --- python/sandcrawler/ia.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index ba4ec31..806f1e7 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -167,8 +167,10 @@ class CdxApiClient: return None rows = [] for raw in rj[1:]: - assert len(raw) == 11 # JSON is short - #print(raw, file=sys.stderr) + # check number of CDX fields; there is a bug with some rows having + # spaces in WARC filename resulting in extra bogus fields + if len(raw) != 11: + raise CdxApiError(f"CDX response had {len(raw)} fields, not 11 expected") # transform "-" ftp status code to a 226 status_code = None -- cgit v1.2.3