diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-02-19 18:38:57 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-02-19 18:38:57 -0800 |
commit | ab95b4b3ae669424581a55668a819eff03098dae (patch) | |
tree | 3b660249dffc14b2082a75f4db8bd6ad2e682621 /python | |
parent | 68b3523dba25bd76b4ceedd5a4d9cd5620259fd3 (diff) | |
download | sandcrawler-ab95b4b3ae669424581a55668a819eff03098dae.tar.gz sandcrawler-ab95b4b3ae669424581a55668a819eff03098dae.zip |
fix empty blob errors
Diffstat (limited to 'python')
-rwxr-xr-x | python/deliver_gwb_to_s3.py | 6 |
1 files changed, 5 insertions, 1 deletions
diff --git a/python/deliver_gwb_to_s3.py b/python/deliver_gwb_to_s3.py
index 02f0c03..b842f97 100755
--- a/python/deliver_gwb_to_s3.py
+++ b/python/deliver_gwb_to_s3.py
@@ -115,10 +115,14 @@ class DeliverGwbS3():
                 continue
             # fetch from GWB/petabox via HTTP range-request
             blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
-            if not blob:
+            if blob is None and status:
                 print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
                 self.count['err-petabox'] += 1
                 continue
+            elif not blob:
+                print("{}\tskip-empty-blob".format(sha1_hex)
+                self.count['skip-empty-blob'] += 1
+                continue
             # verify sha1
             if sha1_hex != hashlib.sha1(blob).hexdigest():
                 #assert sha1_hex == hashlib.sha1(blob).hexdigest()