diff options
| -rw-r--r-- | python/sandcrawler/ia.py | 6 | 
1 files changed, 5 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 65f30e7..8c8e59e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -211,10 +211,14 @@ class CdxApiClient:
             else:
                 status_code = int(raw[4])
 
-            # CDX rows with no WARC records?
+            # remove CDX rows with no WARC records (?)
             if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
                 continue
 
+            # remove CDX rows with SHA256 (not SHA1) digests
+            if raw[5].startswith("sha-256"):
+                continue
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],
