aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-19 15:18:07 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-19 15:18:07 -0800
commit 5bd09c49aa5a29643f45db390ccf2f099b2d143d (patch)
tree c91bb7f72aa8c00ad7735bbc0fcdab2addfa6cc5
parent af051a2f401b97919d5e073f0962d4147fbfac8b (diff)
download sandcrawler-5bd09c49aa5a29643f45db390ccf2f099b2d143d.tar.gz
sandcrawler-5bd09c49aa5a29643f45db390ccf2f099b2d143d.zip
filter out CDX rows missing WARC playback fields
-rw-r--r-- python/sandcrawler/ia.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 9a1b8c8..dbafa01 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -139,6 +139,10 @@ class CdxApiClient:
else:
status_code = int(raw[4])
+ # CDX rows with no WARC records?
+ if raw[8] == '-' or raw[9] == '-' or raw[10] == '-':
+ continue
+
row = CdxRow(
surt=raw[0],
datetime=raw[1],