diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-01-05 17:49:04 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-05 17:49:04 -0800 |
commit | 20bff1b2fc5e16677dee3deff3410cda91baa3e0 (patch) | |
tree | ae821a64681276f07eea3999ddb2b9ebb14cc9ee | |
parent | 4ee8e9364a99d02e22f295bdcf80aafce1ffc03f (diff) | |
download | sandcrawler-20bff1b2fc5e16677dee3deff3410cda91baa3e0.tar.gz sandcrawler-20bff1b2fc5e16677dee3deff3410cda91baa3e0.zip |
ia CDX: handle bad CDX rows
-rw-r--r-- | python/sandcrawler/ia.py | 6 |
1 file changed, 4 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index ba4ec31..806f1e7 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -167,8 +167,10 @@ class CdxApiClient:
             return None
         rows = []
         for raw in rj[1:]:
-            assert len(raw) == 11    # JSON is short
-            #print(raw, file=sys.stderr)
+            # check number of CDX fields; there is a bug with some rows having
+            # spaces in WARC filename resulting in extra bogus fields
+            if len(raw) != 11:
+                raise CdxApiError(f"CDX response had {len(raw)} fields, not 11 expected")
 
             # transform "-" ftp status code to a 226
             status_code = None
```