From 5bd09c49aa5a29643f45db390ccf2f099b2d143d Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Wed, 19 Feb 2020 15:18:07 -0800
Subject: filter out CDX rows missing WARC playback fields

---
 python/sandcrawler/ia.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 9a1b8c8..dbafa01 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -139,6 +139,10 @@ class CdxApiClient:
             else:
                 status_code = int(raw[4])
 
+            # skip CDX rows missing WARC playback fields ('-' in columns 8-10)
+            if raw[8] == '-' or raw[9] == '-' or raw[10] == '-':
+                continue
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],
-- 
cgit v1.2.3