author     Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:54:37 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:54:37 -0700
commit     05bd7cbcc62588e431c5efd533189e246b2a997e (patch)
tree       abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/scripts
parent     f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff)
download   sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz
           sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip
make fmt
Diffstat (limited to 'python/scripts')
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py   | 33
-rwxr-xr-x  python/scripts/archiveorg_fileset.py        | 39
-rwxr-xr-x  python/scripts/cdx_collection.py            | 24
-rwxr-xr-x  python/scripts/covid2ingestrequest.py       | 12
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py  | 11
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py       | 71
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py         | 66
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py        | 15
-rwxr-xr-x  python/scripts/enrich_scored_matches.py     | 14
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py    | 18
-rwxr-xr-x  python/scripts/filter_groupworks.py         |  8
-rwxr-xr-x  python/scripts/filter_scored_matches.py     |  7
-rwxr-xr-x  python/scripts/grobid_affiliations.py       |  6
-rwxr-xr-x  python/scripts/import_grobid_metadata.py    | 31
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py    | 11
-rwxr-xr-x  python/scripts/manifest_converter.py        |  5
-rwxr-xr-x  python/scripts/oai2ingestrequest.py         | 19
-rwxr-xr-x  python/scripts/pdf_thumbnail.py             |  7
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py   | 19
19 files changed, 230 insertions, 186 deletions
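Note on the change itself: the commit message refers to a `make fmt` target whose definition is not part of this diff. Judging from the resulting style (continuation arguments aligned under the opening parenthesis, exactly two blank lines between top-level definitions, normalized spacing around `==` and before inline comments), the formatter was most likely yapf. The following is only a hedged sketch of what such a target could look like; the actual Makefile target name, paths, and any .style.yapf configuration in the repository may differ:

    # Hypothetical Makefile target (paths are assumptions, not taken from this commit)
    .PHONY: fmt
    fmt:
    	yapf --in-place --recursive scripts/ sandcrawler/ tests/

A matching check would be `yapf --diff --recursive` over the same paths, which prints whitespace-only edits like those in the diff below without modifying any files, making it suitable as a CI lint step.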
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 69fe320..9cc9055 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is intended to be used for backfill ingest of old crawls. It can
also be used as a fast path for getting freshly crawled content into fatcat if
@@ -36,37 +35,35 @@ def run(args):
},
}
if args.release_stage:
- assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update')
+ assert args.release_stage in ('published', 'submitted', 'accepted', 'draft',
+ 'update')
request['release_stage'] = args.release_stage
print("{}".format(json.dumps(request, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--link-source',
- required=True,
- help="link_source to include in request")
- parser.add_argument('--extid-type',
- required=True,
- help="extid to encode identifier as")
+ required=True,
+ help="link_source to include in request")
+ parser.add_argument('--extid-type', required=True, help="extid to encode identifier as")
parser.add_argument('--ingest-type',
- default="pdf",
- help="ingest type (pdf, html, xml, etc)")
+ default="pdf",
+ help="ingest type (pdf, html, xml, etc)")
parser.add_argument('--ingest-request-source',
- default="arabesque",
- help="to include in request")
- parser.add_argument('--release-stage',
- default=None,
- help="to include in request")
+ default="arabesque",
+ help="to include in request")
+ parser.add_argument('--release-stage', default=None, help="to include in request")
parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ help="arabesque output file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
index 86ca062..83c04e3 100755
--- a/python/scripts/archiveorg_fileset.py
+++ b/python/scripts/archiveorg_fileset.py
@@ -23,11 +23,9 @@ FORMAT_TO_MIMETYPE = {
'RAR': 'application/vnd.rar',
'TAR': 'application/x-tar',
'7z': 'application/x-7z-compressed',
-
'HTML': 'text/html',
'Text': 'text/plain',
'PDF': 'application/pdf',
-
'CSV': 'text/csv',
'XML': 'application/xml',
'JSON': 'application/json',
@@ -36,20 +34,17 @@ FORMAT_TO_MIMETYPE = {
#'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
#'application/vnd.ms-excel', # .xls
#'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
-
- 'MP3': 'audio/mpeg', # .mp3
-
- 'MP4': 'video/mp4', # .mp4
- 'MPEG': 'video/mpeg', # .mpeg
-
+ 'MP3': 'audio/mpeg', # .mp3
+ 'MP4': 'video/mp4', # .mp4
+ 'MPEG': 'video/mpeg', # .mpeg
'JPEG': 'image/jpeg',
'GIF': 'image/gif',
'PNG': 'image/png',
'TIFF': 'image/tiff',
-
'Unknown': None,
}
+
def want_file(f: dict, item_name: str) -> bool:
"""
Filters IA API files
@@ -57,12 +52,12 @@ def want_file(f: dict, item_name: str) -> bool:
if f.source != 'original':
return False
for suffix in [
- '_meta.sqlite',
- '_archive.torrent',
- '_itemimage.jpg',
- '_meta.xml',
- '_thumb.png',
- '_files.xml',
+ '_meta.sqlite',
+ '_archive.torrent',
+ '_itemimage.jpg',
+ '_meta.xml',
+ '_thumb.png',
+ '_files.xml',
]:
if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
return False
@@ -74,6 +69,7 @@ def want_file(f: dict, item_name: str) -> bool:
return False
return True
+
def parse_file(f: dict) -> dict:
"""
Takes an IA API file and turns it in to a fatcat fileset manifest file
@@ -93,6 +89,7 @@ def parse_file(f: dict) -> dict:
mf['extra'] = dict(mimetype=mimetype)
return mf
+
def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
if release_id.startswith('release_'):
@@ -104,18 +101,17 @@ def item_to_fileset(item_name: str, release_id: str, session: internetarchive.Ar
manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
fileset = {
'manifest': manifest,
- 'urls': [
- {
- 'rel': 'archive',
- 'url': f'https://archive.org/download/{item_name}/',
- },
- ],
+ 'urls': [{
+ 'rel': 'archive',
+ 'url': f'https://archive.org/download/{item_name}/',
+ }, ],
'release_ids': [release_id],
#extra={},
}
print(json.dumps(fileset))
return fileset
+
def main():
session = internetarchive.get_session()
if len(sys.argv) == 3:
@@ -133,5 +129,6 @@ def main():
release_id = fields[1]
item_to_fileset(item_name, release_id=release_id, session=session)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
index 5e33def..aa78aec 100755
--- a/python/scripts/cdx_collection.py
+++ b/python/scripts/cdx_collection.py
@@ -35,9 +35,7 @@ def run():
print("Looking up collection: {}".format(collection))
# First fetch list
- item_list = list(
- ia.search_items(
- query="collection:{} mediatype:web".format(collection)))
+ item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection)))
if len(item_list) == 0:
print("No items found, bailing")
@@ -50,11 +48,12 @@ def run():
item = item['identifier']
# TODO: error handling
try:
- ret = ia.download(item, files=[item + '.cdx.gz'],
- verbose=True,
- destdir=tempdir,
- no_directory=True,
- retries=1000)
+ ret = ia.download(item,
+ files=[item + '.cdx.gz'],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000)
status = ret and status
except requests.exceptions.ReadTimeout as rt:
print(str(rt), file=sys.stderr)
@@ -69,14 +68,13 @@ def run():
# Combine files
print("Merging and re-compressing all CDX files...")
#subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
- subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir),
- shell=True)
+ subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), shell=True)
# Move and cleanup
- shutil.move('{}/combined.gz'.format(tempdir),
- '{}.cdx.gz'.format(collection))
+ shutil.move('{}/combined.gz'.format(tempdir), '{}.cdx.gz'.format(collection))
print("Done!")
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 1b7c85c..4714b60 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform an unpaywall dump (JSON) into ingest requests.
"""
@@ -21,7 +20,6 @@ def transform_cnki(obj):
requests = []
assert obj['cnki_id']
-
requests = []
requests.append({
'base_url': canon(obj['info_url']),
@@ -41,6 +39,7 @@ def transform_cnki(obj):
return requests
+
def transform_wanfang(obj):
assert obj['wanfang_id']
@@ -68,17 +67,18 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="COVID-19 metadata file to use",
- type=argparse.FileType('r'))
+ help="COVID-19 metadata file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 62a85e6..3b53235 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -49,7 +49,6 @@ def b32_hex(s):
class DeliverDumpGrobidS3():
-
def __init__(self, s3_bucket, **kwargs):
self.rstore = None
self.count = Counter()
@@ -80,11 +79,7 @@ class DeliverDumpGrobidS3():
tei_xml = tei_xml.encode('utf-8')
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
Body=tei_xml,
StorageClass=self.s3_storage_class,
)
@@ -92,6 +87,7 @@ class DeliverDumpGrobidS3():
self.count['success-s3'] += 1
sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
@@ -121,5 +117,6 @@ def main():
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == '__main__': # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index ab1906a..ca19b97 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -26,7 +26,6 @@ sentry_client = raven.Client()
class DeliverGwbDisk:
-
def __init__(self, disk_dir, **kwargs):
self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
self.rstore = None
@@ -34,7 +33,8 @@ class DeliverGwbDisk:
# /serve/ instead of /download/ doesn't record view count
self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
+ os.environ.get('PETABOX_WEBDATA_SECRET'))
self.disk_dir = disk_dir
self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
@@ -42,48 +42,56 @@ class DeliverGwbDisk:
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url))
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".
+ format(ve))
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".
+ format(eofe))
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
+ .format(te))
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
+ format(ire))
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Ensuring all 65536 base directories exist...\n")
for i in range(256):
for j in range(256):
- fpath = "{}/{}{:02x}/{:02x}".format(
- self.disk_dir,
- self.disk_prefix,
- i,
- j)
+ fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j)
os.makedirs(fpath, exist_ok=True)
sys.stderr.write("Starting...\n")
for line in manifest_file:
@@ -102,9 +110,11 @@ class DeliverGwbDisk:
self.count['skip-warc'] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
+ file_cdx['c_size'])
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+ print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
+ status['reason']))
self.count['err-petabox-fetch'] += 1
continue
elif not blob:
@@ -120,19 +130,15 @@ class DeliverGwbDisk:
self.count['petabox-ok'] += 1
# save to disk
- fpath = "{}/{}{}/{}/{}{}".format(
- self.disk_dir,
- self.disk_prefix,
- sha1_hex[0:2],
- sha1_hex[2:4],
- sha1_hex,
- self.disk_suffix)
+ fpath = "{}/{}{}/{}/{}{}".format(self.disk_dir, self.disk_prefix, sha1_hex[0:2],
+ sha1_hex[2:4], sha1_hex, self.disk_suffix)
with open(fpath, 'wb') as f:
f.write(blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
self.count['success-disk'] += 1
sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
@@ -162,5 +168,6 @@ def main():
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == '__main__': # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index f103205..f9b3b19 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -53,7 +53,6 @@ sentry_client = raven.Client()
class DeliverGwbS3:
-
def __init__(self, s3_bucket, **kwargs):
self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
self.rstore = None
@@ -61,7 +60,8 @@ class DeliverGwbS3:
# /serve/ instead of /download/ doesn't record view count
self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
+ os.environ.get('PETABOX_WEBDATA_SECRET'))
self.s3_bucket = s3_bucket
self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
@@ -71,37 +71,49 @@ class DeliverGwbS3:
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url))
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".
+ format(ve))
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".
+ format(eofe))
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
+ .format(te))
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
+ format(ire))
return raw_content, None
def run(self, manifest_file):
@@ -122,9 +134,11 @@ class DeliverGwbS3:
self.count['skip-warc'] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
+ file_cdx['c_size'])
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+ print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
+ status['reason']))
self.count['err-petabox-fetch'] += 1
continue
elif not blob:
@@ -140,17 +154,14 @@ class DeliverGwbS3:
self.count['petabox-ok'] += 1
# upload to AWS S3
- obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
- Body=blob)
+ obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4],
+ sha1_hex, self.s3_suffix),
+ Body=blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
self.count['success-s3'] += 1
sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
@@ -180,5 +191,6 @@ def main():
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == '__main__': # pragma: no cover
main()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index 15b30a0..84a2c2c 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform an DOAJ article dump (JSON) into ingest requests.
@@ -42,22 +41,22 @@ CONTENT_TYPE_MAP = {
"abstract": [],
"doc": [],
"": ["pdf"],
-
"doi": ["pdf"],
"url": ["pdf"],
"fulltext": ["pdf"],
"anySimpleType": ["pdf"],
-
"application/pdf": ["pdf"],
"html": ["html", "pdf"],
"text/html": ["html", "pdf"],
"xml": ["xml"],
}
+
def canon(s: str) -> str:
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj: dict) -> List[dict]:
"""
Transforms from a single DOAJ object to zero or more ingest requests.
@@ -118,6 +117,7 @@ def transform(obj: dict) -> List[dict]:
return requests
+
def run(args) -> None:
for l in args.json_file:
if not l.strip():
@@ -128,17 +128,18 @@ def run(args) -> None:
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main() -> None:
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="DOAJ article dump file to use",
- type=argparse.FileType('r'))
+ help="DOAJ article dump file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 3085346..54c3d5f 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
@@ -34,13 +34,13 @@ def run():
sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
- obj = dict(
- sha1=sha1,
- dois=dois,
- cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
- size=size,
- mimetype=mimetype)
+ obj = dict(sha1=sha1,
+ dois=dois,
+ cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+ size=size,
+ mimetype=mimetype)
print(json.dumps(obj))
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index d0666ce..a474393 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -24,6 +24,7 @@ NAME_DENYLIST = (
'phdstudent',
)
+
def tokenize(s, remove_whitespace=True):
s.replace('&apos;', "'")
@@ -36,9 +37,11 @@ def tokenize(s, remove_whitespace=True):
# Encode as dumb ASCII (TODO: this is horrible)
return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+
assert tokenize("Impact Factor: 2.114") == "impactfactor"
assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
+
def filter_title(title):
title = title.strip()
@@ -83,19 +86,23 @@ def filter_title(title):
return title
+
def filter_author_name(name):
name = name['name']
if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
return None
return ' '.join([t for t in name.split() if tokenize(t)])
+
def filter_authors(l):
return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
+
def filter_refs(l):
# TODO:
return l
+
def filter_journal_name(name):
# same denylist, for now
if not name:
@@ -104,10 +111,12 @@ def filter_journal_name(name):
slug_name = tokenize(name)
if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
return None
- for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+ for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ",
+ "Research Article ", "Available online www.jocpr.com "):
if name.startswith(prefix):
name = name.replace(prefix, '')
- for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+ for suffix in (" Available online at www.sciarena.com", " Original Article",
+ " Available online at", " ISSN", " ISSUE"):
if name.endswith(suffix):
name = name.replace(suffix, '')
if "====================" in name:
@@ -116,6 +125,7 @@ def filter_journal_name(name):
return None
return ' '.join(name.split())
+
def filter_metadata(obj):
if not (obj.get('title') and obj.get('authors')):
return None
@@ -132,6 +142,7 @@ def filter_metadata(obj):
return obj
+
def run(invert=False):
for line in sys.stdin:
fields = line.split('\t')
@@ -155,5 +166,6 @@ def run(invert=False):
elif invert:
print(raw.strip())
-if __name__=="__main__":
+
+if __name__ == "__main__":
run(invert="--invert" in sys.argv)
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index 494da71..fda9098 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -28,6 +28,7 @@ MAX_SLUG_LINES = 50
REQUIRE_AUTHORS = False
+
def tokenize(s, remove_whitespace=False):
s.replace('&apos;', "'")
@@ -40,6 +41,7 @@ def tokenize(s, remove_whitespace=False):
# Encode as dumb ASCII (TODO: this is horrible)
return s.encode('ascii', 'replace').replace(b'?', b'')
+
def check_authors(left, right):
"""
Intended to check GROBID extracted authors (right) against "known good"
@@ -63,6 +65,7 @@ def check_authors(left, right):
return False
return True
+
def test_check_authors():
assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
assert not check_authors([], ['one'])
@@ -74,6 +77,7 @@ def test_check_authors():
assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+
# Rows are (score, left, right)
def process_group(rows):
@@ -119,6 +123,7 @@ def process_group(rows):
print(json.dumps([releases[ident] for ident in group_ids]))
+
def run():
last_slug = None
@@ -140,5 +145,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index abf81bd..3251852 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -33,6 +33,7 @@ def tokenize(s, remove_whitespace=False):
# Encode as dumb ASCII (TODO: this is horrible)
return s.encode('ascii', 'replace').replace(b'?', b'')
+
def check_authors(left, right):
"""
Intended to check GROBID extracted authors (right) against "known good"
@@ -56,6 +57,7 @@ def check_authors(left, right):
return False
return True
+
def test_check_authors():
assert not check_authors([], [])
assert not check_authors([], ['one'])
@@ -67,6 +69,7 @@ def test_check_authors():
assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+
# Rows are (score, grobid, crossref)
def process_group(rows):
if len(rows) > max_slug_lines:
@@ -92,6 +95,7 @@ def process_group(rows):
for sha1, doi_list in keepers.items():
print("{}\t{}".format(sha1, json.dumps(doi_list)))
+
def run():
last_slug = None
@@ -112,5 +116,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index d391f60..b42153c 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, filters out raw affiliation strings, and
@@ -24,10 +23,12 @@ def parse_hbase(line):
tei_xml = obj['tei_xml']
return sha1hex, tei_xml
+
def parse_pg(line):
obj = json.loads(line)
return obj['sha1hex'], obj['tei_xml']
+
def run(mode='hbase'):
for line in sys.stdin:
if mode == 'hbase':
@@ -49,5 +50,6 @@ def run(mode='hbase'):
affiliations = [json.loads(a) for a in affiliations]
print('\t'.join([sha1hex, json.dumps(affiliations)]))
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index 8aee0be..c9bc134 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -4,7 +4,8 @@ import datetime
import json
import sys
-MAX_ABSTRACT_BYTES=4096
+MAX_ABSTRACT_BYTES = 4096
+
def parse_grobid_json(obj):
@@ -14,10 +15,7 @@ def parse_grobid_json(obj):
extra = dict()
if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
- abobj = dict(
- mimetype="text/plain",
- language=None,
- content=obj.get('abstract').strip())
+ abobj = dict(mimetype="text/plain", language=None, content=obj.get('abstract').strip())
abstracts = [abobj]
else:
abstracts = None
@@ -72,16 +70,16 @@ def parse_grobid_json(obj):
else:
extra = None
- return dict(
- title=obj['title'].strip(),
- contribs=contribs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
- abstracts=abstracts,
- release_type=release_type,
- release_date=release_date,
- extra=extra)
+ return dict(title=obj['title'].strip(),
+ contribs=contribs,
+ publisher=obj['journal'].get('publisher'),
+ volume=obj['journal'].get('volume'),
+ issue=obj['journal'].get('issue'),
+ abstracts=abstracts,
+ release_type=release_type,
+ release_date=release_date,
+ extra=extra)
+
def run():
for line in sys.stdin:
@@ -90,5 +88,6 @@ def run():
if out:
print(out)
-if __name__=="__main__":
+
+if __name__ == "__main__":
run()
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index acba2a8..70731d5 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is used to turn ingest request postgres rows (in JSON export
format) back in to regular ingest request JSON.
@@ -25,6 +24,7 @@ def transform(row):
row['fatcat'] = dict(release_ident=extra['release_ident'])
return row
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -35,17 +35,18 @@ def run(args):
print(l, file=sys.stderr)
print(json.dumps(req, sort_keys=True))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ help="arabesque output file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 8267003..24e22fd 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -20,6 +20,7 @@ import sys
# 2. select all file metadata
# 3. output object
+
def or_none(s):
if s is None:
return None
@@ -27,6 +28,7 @@ def or_none(s):
return None
return s
+
def process_db(db_path):
db = sqlite3.connect(db_path)
@@ -52,5 +54,6 @@ def process_db(db_path):
dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
print(json.dumps(obj))
-if __name__=="__main__":
+
+if __name__ == "__main__":
process_db(sys.argv[1])
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 315b8d2..1f4a19f 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform an OAI-PMH bulk dump (JSON) into ingest requests.
@@ -33,17 +32,19 @@ DOMAIN_BLOCKLIST = [
]
RELEASE_STAGE_MAP = {
- 'info:eu-repo/semantics/draftVersion': 'draft',
+ 'info:eu-repo/semantics/draftVersion': 'draft',
'info:eu-repo/semantics/submittedVersion': 'submitted',
- 'info:eu-repo/semantics/acceptedVersion': 'accepted',
+ 'info:eu-repo/semantics/acceptedVersion': 'accepted',
'info:eu-repo/semantics/publishedVersion': 'published',
- 'info:eu-repo/semantics/updatedVersion': 'updated',
+ 'info:eu-repo/semantics/updatedVersion': 'updated',
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single OAI-PMH object to zero or more ingest requests.
@@ -112,6 +113,7 @@ def transform(obj):
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -122,17 +124,18 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="OAI-PMH dump file to use (usually stdin)",
- type=argparse.FileType('r'))
+ help="OAI-PMH dump file to use (usually stdin)",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index 71fbe54..3f81b3b 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc).
@@ -23,12 +22,14 @@ def run(inpath, outpath):
renderer = poppler.PageRenderer()
full_page = renderer.render_page(page)
- img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1)
- img.thumbnail((180,300), Image.BICUBIC)
+ img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw',
+ "BGRA", 0, 1)
+ img.thumbnail((180, 300), Image.BICUBIC)
#img.thumbnail((360,600), Image.BICUBIC)
img.save(outpath)
#img.save(outpath, quality=95)
+
if __name__ == '__main__':
if len(sys.argv) != 3:
print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 590b429..b79f316 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform an unpaywall dump (JSON) into ingest requests.
"""
@@ -26,17 +25,19 @@ DOMAIN_BLOCKLIST = [
]
RELEASE_STAGE_MAP = {
- 'draftVersion': 'draft',
+ 'draftVersion': 'draft',
'submittedVersion': 'submitted',
- 'acceptedVersion': 'accepted',
+ 'acceptedVersion': 'accepted',
'publishedVersion': 'published',
- 'updatedVersion': 'updated',
+ 'updatedVersion': 'updated',
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single unpaywall object to zero or more ingest requests.
@@ -86,6 +87,7 @@ def transform(obj):
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -96,17 +98,18 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="unpaywall dump file to use",
- type=argparse.FileType('r'))
+ help="unpaywall dump file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()