diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-25 13:39:15 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-25 13:39:15 -0700 |
commit | 6a2630f236115b8c15b895f1506d02f1f110aa28 (patch) | |
tree | 04a61ee680ddb55b68f3c94b4b99f27c2cfc026f /python | |
parent | 452c9adfe00a1e64c78afecfcaa96ca3758c8086 (diff) | |
download | sandcrawler-6a2630f236115b8c15b895f1506d02f1f110aa28.tar.gz sandcrawler-6a2630f236115b8c15b895f1506d02f1f110aa28.zip |
CDX: skip sha-256 digests
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 6 |
1 file changed, 5 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 65f30e7..8c8e59e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -211,10 +211,14 @@ class CdxApiClient:
             else:
                 status_code = int(raw[4])
 
-            # CDX rows with no WARC records?
+            # remove CDX rows with no WARC records (?)
             if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
                 continue
 
+            # remove CDX rows with SHA256 (not SHA1) digests
+            if raw[5].startswith("sha-256"):
+                continue
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],
```