about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-07-25 13:39:15 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-25 13:39:15 -0700
commit: 6a2630f236115b8c15b895f1506d02f1f110aa28 (patch)
tree: 04a61ee680ddb55b68f3c94b4b99f27c2cfc026f
parent: 452c9adfe00a1e64c78afecfcaa96ca3758c8086 (diff)
download: sandcrawler-6a2630f236115b8c15b895f1506d02f1f110aa28.tar.gz
sandcrawler-6a2630f236115b8c15b895f1506d02f1f110aa28.zip
CDX: skip sha-256 digests
-rw-r--r-- python/sandcrawler/ia.py 6
1 file changed, 5 insertions, 1 deletion
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 65f30e7..8c8e59e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -211,10 +211,14 @@ class CdxApiClient:
else:
status_code = int(raw[4])
- # CDX rows with no WARC records?
+ # remove CDX rows with no WARC records (?)
if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
continue
+ # remove CDX rows with SHA256 (not SHA1) digests
+ if raw[5].startswith("sha-256"):
+ continue
+
row = CdxRow(
surt=raw[0],
datetime=raw[1],