about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/ia.py 6
1 files changed, 4 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index ba4ec31..806f1e7 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -167,8 +167,10 @@ class CdxApiClient:
return None
rows = []
for raw in rj[1:]:
- assert len(raw) == 11 # JSON is short
- #print(raw, file=sys.stderr)
+ # check number of CDX fields; there is a bug with some rows having
+ # spaces in WARC filename resulting in extra bogus fields
+ if len(raw) != 11:
+ raise CdxApiError(f"CDX response had {len(raw)} fields, not 11 expected")
# transform "-" ftp status code to a 226
status_code = None