diff options
Diffstat (limited to 'python/scripts/deliver_gwb_to_s3.py')
| -rwxr-xr-x | python/scripts/deliver_gwb_to_s3.py | 66 | 
1 files changed, 39 insertions, 27 deletions
| diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index f103205..f9b3b19 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -53,7 +53,6 @@ sentry_client = raven.Client()  class DeliverGwbS3: -      def __init__(self, s3_bucket, **kwargs):          self.warc_uri_prefix = kwargs.get('warc_uri_prefix')          self.rstore = None @@ -61,7 +60,8 @@ class DeliverGwbS3:          # /serve/ instead of /download/ doesn't record view count          self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')          # gwb library will fall back to reading from /opt/.petabox/webdata.secret -        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) +        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', +                                                 os.environ.get('PETABOX_WEBDATA_SECRET'))          self.s3_bucket = s3_bucket          self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')          self.s3_suffix = kwargs.get('s3_suffix', '.pdf') @@ -71,37 +71,49 @@ class DeliverGwbS3:      def fetch_warc_content(self, warc_path, offset, c_size):          warc_uri = self.warc_uri_prefix + warc_path          if not self.rstore: -            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( -                webdata_secret=self.petabox_webdata_secret, -                download_base_url=self.petabox_base_url)) +            self.rstore = ResourceStore( +                loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret, +                                               download_base_url=self.petabox_base_url))          try:              gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)          except wayback.exception.ResourceUnavailable: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)" +            )          except ValueError as ve: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (ValueError: {})". +                format(ve))          except EOFError as eofe: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (EOFError: {})". +                format(eofe))          except TypeError as te: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) +            return None, dict( +                status="error", +                reason= +                "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" +                .format(te))          # Note: could consider a generic "except Exception" here, as we get so          # many petabox errors. Do want jobs to fail loud and clear when the          # whole cluster is down though.          if gwb_record.get_status()[0] != 200:              return None, dict(status="error", -                reason="archived HTTP response (WARC) was not 200", -                warc_status=gwb_record.get_status()[0]) +                              reason="archived HTTP response (WARC) was not 200", +                              warc_status=gwb_record.get_status()[0])          try:              raw_content = gwb_record.open_raw_content().read()          except IncompleteRead as ire: -            return None, dict(status="error", -                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) +            return None, dict( +                status="error", +                reason= +                "failed to read actual file contents from wayback/petabox (IncompleteRead: {})". +                format(ire))          return raw_content, None      def run(self, manifest_file): @@ -122,9 +134,11 @@ class DeliverGwbS3:                  self.count['skip-warc'] += 1                  continue              # fetch from GWB/petabox via HTTP range-request -            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) +            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], +                                                   file_cdx['c_size'])              if blob is None and status: -                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) +                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], +                                                         status['reason']))                  self.count['err-petabox-fetch'] += 1                  continue              elif not blob: @@ -140,17 +154,14 @@ class DeliverGwbS3:              self.count['petabox-ok'] += 1              # upload to AWS S3 -            obj = self.bucket.put_object( -                Key="{}{}/{}{}".format( -                    self.s3_prefix, -                    sha1_hex[0:4], -                    sha1_hex, -                    self.s3_suffix), -                Body=blob) +            obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], +                                                                sha1_hex, self.s3_suffix), +                                         Body=blob)              print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))              self.count['success-s3'] += 1          sys.stderr.write("{}\n".format(self.count)) +  @sentry_client.capture_exceptions  def main(): @@ -180,5 +191,6 @@ def main():      worker = DeliverGwbS3(**args.__dict__)      worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == '__main__':  # pragma: no cover      main() | 
