diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-19 15:18:07 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-19 15:18:07 -0800 |
commit | 5bd09c49aa5a29643f45db390ccf2f099b2d143d (patch) | |
tree | c91bb7f72aa8c00ad7735bbc0fcdab2addfa6cc5 /python | |
parent | af051a2f401b97919d5e073f0962d4147fbfac8b (diff) | |
download | sandcrawler-5bd09c49aa5a29643f45db390ccf2f099b2d143d.tar.gz sandcrawler-5bd09c49aa5a29643f45db390ccf2f099b2d143d.zip |
filter out CDX rows missing WARC playback fields
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 4 |
1 files changed, 4 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 9a1b8c8..dbafa01 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -139,6 +139,10 @@ class CdxApiClient:
             else:
                 status_code = int(raw[4])
 
+            # CDX rows with no WARC records?
+            if raw[8] == '-' or raw[9] == '-' or raw[10] == '-':
+                continue
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],
```