make fmt

author: Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:54:37 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:54:37 -0700
commit: 05bd7cbcc62588e431c5efd533189e246b2a997e (patch)
tree: abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/scripts/deliver_gwb_to_s3.py
parent: f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff)
download: sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip
1 file changed, 39 insertions, 27 deletions
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index f103205..f9b3b19 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -53,7 +53,6 @@ sentry_client = raven.Client()
 
 
 class DeliverGwbS3:
-
     def __init__(self, s3_bucket, **kwargs):
         self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
         self.rstore = None
@@ -61,7 +60,8 @@ class DeliverGwbS3:
         # /serve/ instead of /download/ doesn't record view count
         self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
         # gwb library will fall back to reading from /opt/.petabox/webdata.secret
-        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
+                                                 os.environ.get('PETABOX_WEBDATA_SECRET'))
         self.s3_bucket = s3_bucket
         self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
         self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
@@ -71,37 +71,49 @@ class DeliverGwbS3:
     def fetch_warc_content(self, warc_path, offset, c_size):
         warc_uri = self.warc_uri_prefix + warc_path
         if not self.rstore:
-            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
-                webdata_secret=self.petabox_webdata_secret,
-                download_base_url=self.petabox_base_url))
+            self.rstore = ResourceStore(
+                loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
+                                               download_base_url=self.petabox_base_url))
         try:
             gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
         except wayback.exception.ResourceUnavailable:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+            return None, dict(
+                status="error",
+                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+            )
         except ValueError as ve:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+            return None, dict(
+                status="error",
+                reason="failed to load file contents from wayback/petabox (ValueError: {})".
+                format(ve))
         except EOFError as eofe:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+            return None, dict(
+                status="error",
+                reason="failed to load file contents from wayback/petabox (EOFError: {})".
+                format(eofe))
         except TypeError as te:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+            return None, dict(
+                status="error",
+                reason=
+                "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
+                .format(te))
         # Note: could consider a generic "except Exception" here, as we get so
         # many petabox errors. Do want jobs to fail loud and clear when the
         # whole cluster is down though.
 
         if gwb_record.get_status()[0] != 200:
             return None, dict(status="error",
-                reason="archived HTTP response (WARC) was not 200",
-                warc_status=gwb_record.get_status()[0])
+                              reason="archived HTTP response (WARC) was not 200",
+                              warc_status=gwb_record.get_status()[0])
 
         try:
             raw_content = gwb_record.open_raw_content().read()
         except IncompleteRead as ire:
-            return None, dict(status="error",
-                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+            return None, dict(
+                status="error",
+                reason=
+                "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
+                format(ire))
         return raw_content, None
 
     def run(self, manifest_file):
@@ -122,9 +134,11 @@ class DeliverGwbS3:
                 self.count['skip-warc'] += 1
                 continue
             # fetch from GWB/petabox via HTTP range-request
-            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
+                                                   file_cdx['c_size'])
             if blob is None and status:
-                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
+                                                         status['reason']))
                 self.count['err-petabox-fetch'] += 1
                 continue
             elif not blob:
@@ -140,17 +154,14 @@ class DeliverGwbS3:
 
             self.count['petabox-ok'] += 1
             # upload to AWS S3
-            obj = self.bucket.put_object(
-                Key="{}{}/{}{}".format(
-                    self.s3_prefix,
-                    sha1_hex[0:4],
-                    sha1_hex,
-                    self.s3_suffix),
-                Body=blob)
+            obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4],
+                                                                sha1_hex, self.s3_suffix),
+                                         Body=blob)
             print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
             self.count['success-s3'] += 1
         sys.stderr.write("{}\n".format(self.count))
 
+
 @sentry_client.capture_exceptions
 def main():
 
@@ -180,5 +191,6 @@ def main():
     worker = DeliverGwbS3(**args.__dict__)
     worker.run(args.manifest_file)
 
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == '__main__':  # pragma: no cover
     main()
author	Bryan Newbold <bnewbold@archive.org>	2021-10-26 12:54:37 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-26 12:54:37 -0700
commit	05bd7cbcc62588e431c5efd533189e246b2a997e (patch)
tree	abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/scripts/deliver_gwb_to_s3.py
parent	f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff)
download	sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip