aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-01-05 17:49:04 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-01-05 17:49:04 -0800
commit: 20bff1b2fc5e16677dee3deff3410cda91baa3e0 (patch)
tree: ae821a64681276f07eea3999ddb2b9ebb14cc9ee
parent: 4ee8e9364a99d02e22f295bdcf80aafce1ffc03f (diff)
download: sandcrawler-20bff1b2fc5e16677dee3deff3410cda91baa3e0.tar.gz
download: sandcrawler-20bff1b2fc5e16677dee3deff3410cda91baa3e0.zip
ia CDX: handle bad CDX rows
-rw-r--r-- python/sandcrawler/ia.py 6
1 file changed, 4 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index ba4ec31..806f1e7 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -167,8 +167,10 @@ class CdxApiClient:
return None
rows = []
for raw in rj[1:]:
- assert len(raw) == 11 # JSON is short
- #print(raw, file=sys.stderr)
+ # check number of CDX fields; there is a bug with some rows having
+ # spaces in WARC filename resulting in extra bogus fields
+ if len(raw) != 11:
+ raise CdxApiError(f"CDX response had {len(raw)} fields, not 11 expected")
# transform "-" ftp status code to a 226
status_code = None