From 5bd09c49aa5a29643f45db390ccf2f099b2d143d Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 19 Feb 2020 15:18:07 -0800
Subject: filter out CDX rows missing WARC playback fields

---
 python/sandcrawler/ia.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 9a1b8c8..dbafa01 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -139,6 +139,10 @@ class CdxApiClient:
             else:
                 status_code = int(raw[4])
 
+            # CDX rows with no WARC records?
+            if raw[8] == '-' or raw[9] == '-' or raw[10] == '-':
+                continue
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],
-- 
cgit v1.2.3