aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/deliver_gwb_to_s3.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-26 12:54:37 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-26 12:54:37 -0700
commit05bd7cbcc62588e431c5efd533189e246b2a997e (patch)
treeabcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/scripts/deliver_gwb_to_s3.py
parentf3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff)
downloadsandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz
sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip
make fmt
Diffstat (limited to 'python/scripts/deliver_gwb_to_s3.py')
-rwxr-xr-xpython/scripts/deliver_gwb_to_s3.py66
1 files changed, 39 insertions, 27 deletions
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index f103205..f9b3b19 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -53,7 +53,6 @@ sentry_client = raven.Client()
class DeliverGwbS3:
-
def __init__(self, s3_bucket, **kwargs):
self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
self.rstore = None
@@ -61,7 +60,8 @@ class DeliverGwbS3:
# /serve/ instead of /download/ doesn't record view count
self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
+ os.environ.get('PETABOX_WEBDATA_SECRET'))
self.s3_bucket = s3_bucket
self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
@@ -71,37 +71,49 @@ class DeliverGwbS3:
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url))
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".
+ format(ve))
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".
+ format(eofe))
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
+ .format(te))
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
+ format(ire))
return raw_content, None
def run(self, manifest_file):
@@ -122,9 +134,11 @@ class DeliverGwbS3:
self.count['skip-warc'] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
+ file_cdx['c_size'])
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+ print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
+ status['reason']))
self.count['err-petabox-fetch'] += 1
continue
elif not blob:
@@ -140,17 +154,14 @@ class DeliverGwbS3:
self.count['petabox-ok'] += 1
# upload to AWS S3
- obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
- Body=blob)
+ obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4],
+ sha1_hex, self.s3_suffix),
+ Body=blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
self.count['success-s3'] += 1
sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
@@ -180,5 +191,6 @@ def main():
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == '__main__': # pragma: no cover
main()