about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-02-19 18:38:57 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-02-19 18:38:57 -0800
commit ab95b4b3ae669424581a55668a819eff03098dae (patch)
tree 3b660249dffc14b2082a75f4db8bd6ad2e682621 /python
parent 68b3523dba25bd76b4ceedd5a4d9cd5620259fd3 (diff)
download sandcrawler-ab95b4b3ae669424581a55668a819eff03098dae.tar.gz
sandcrawler-ab95b4b3ae669424581a55668a819eff03098dae.zip
fix empty blob errors
Diffstat (limited to 'python')
-rwxr-xr-x python/deliver_gwb_to_s3.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/python/deliver_gwb_to_s3.py b/python/deliver_gwb_to_s3.py
index 02f0c03..b842f97 100755
--- a/python/deliver_gwb_to_s3.py
+++ b/python/deliver_gwb_to_s3.py
@@ -115,10 +115,14 @@ class DeliverGwbS3():
continue
# fetch from GWB/petabox via HTTP range-request
blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
- if not blob:
+ if blob is None and status:
print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
self.count['err-petabox'] += 1
continue
+ elif not blob:
+ print("{}\tskip-empty-blob".format(sha1_hex)
+ self.count['skip-empty-blob'] += 1
+ continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
#assert sha1_hex == hashlib.sha1(blob).hexdigest()