author     Bryan Newbold <bnewbold@archive.org>   2021-10-27 18:50:17 -0700
committer  Bryan Newbold <bnewbold@archive.org>   2021-10-27 18:50:17 -0700
commit     826c7538e091fac14d987a3cd654975da964e240 (patch)
tree       90345b4cabb461c624ca5a218c2fc01dce3055cd /python/scripts
parent     020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
make fmt (black 21.9b0)
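
The reformatting below was produced by the repository's `make fmt` target with black pinned at 21.9b0. A minimal sketch of the equivalent invocation, assuming black is installed in the project's pipenv environment (the pipenv wrapper and exact paths are assumptions, not taken from the Makefile):

    # assumption: black 21.9b0 available via the project's pipenv environment
    pipenv run black python/scripts/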
Diffstat (limited to 'python/scripts')
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py      57
-rwxr-xr-x  python/scripts/archiveorg_fileset.py           99
-rwxr-xr-x  python/scripts/cdx_collection.py               26
-rwxr-xr-x  python/scripts/covid2ingestrequest.py          66
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py     67
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py         154
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py           148
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py           64
-rwxr-xr-x  python/scripts/enrich_scored_matches.py        20
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py       99
-rwxr-xr-x  python/scripts/filter_groupworks.py            40
-rwxr-xr-x  python/scripts/filter_scored_matches.py        42
-rwxr-xr-x  python/scripts/grobid_affiliations.py          24
-rwxr-xr-x  python/scripts/import_grobid_metadata.py       64
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py       18
-rwxr-xr-x  python/scripts/oai2ingestrequest.py            64
-rwxr-xr-x  python/scripts/pdf_thumbnail.py                11
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py      63
18 files changed, 601 insertions, 525 deletions
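
One detail worth noting when reading the hunks below: the reformatted lines wrap at roughly 96 characters rather than black's default of 88, which implies a project-level line-length override. A hypothetical pyproject.toml stanza that would reproduce this wrapping (the value is inferred from the diff itself and the file location is an assumption):

    [tool.black]
    # 96 is inferred from where the reformatted argument lists wrap; an assumption
    line-length = 96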
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 9cc9055..4561541 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -21,43 +21,48 @@ def run(args):
if not l.strip():
continue
row = json.loads(l)
- if not row['hit']:
+ if not row["hit"]:
continue
request = {
- 'base_url': row['final_url'],
- 'ingest_type': args.ingest_type,
- 'link_source': args.link_source,
- 'link_source_id': row['identifier'],
- 'ingest_request_source': args.ingest_request_source,
- 'ext_ids': {
- args.extid_type: row['identifier'],
+ "base_url": row["final_url"],
+ "ingest_type": args.ingest_type,
+ "link_source": args.link_source,
+ "link_source_id": row["identifier"],
+ "ingest_request_source": args.ingest_request_source,
+ "ext_ids": {
+ args.extid_type: row["identifier"],
},
}
if args.release_stage:
- assert args.release_stage in ('published', 'submitted', 'accepted', 'draft',
- 'update')
- request['release_stage'] = args.release_stage
+ assert args.release_stage in (
+ "published",
+ "submitted",
+ "accepted",
+ "draft",
+ "update",
+ )
+ request["release_stage"] = args.release_stage
print("{}".format(json.dumps(request, sort_keys=True)))
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--link-source',
- required=True,
- help="link_source to include in request")
- parser.add_argument('--extid-type', required=True, help="extid to encode identifier as")
- parser.add_argument('--ingest-type',
- default="pdf",
- help="ingest type (pdf, html, xml, etc)")
- parser.add_argument('--ingest-request-source',
- default="arabesque",
- help="to include in request")
- parser.add_argument('--release-stage', default=None, help="to include in request")
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--link-source", required=True, help="link_source to include in request"
+ )
+ parser.add_argument("--extid-type", required=True, help="extid to encode identifier as")
+ parser.add_argument(
+ "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)"
+ )
+ parser.add_argument(
+ "--ingest-request-source", default="arabesque", help="to include in request"
+ )
+ parser.add_argument("--release-stage", default=None, help="to include in request")
+ parser.add_argument(
+ "json_file", help="arabesque output file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -65,5 +70,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
index 83c04e3..6328f52 100755
--- a/python/scripts/archiveorg_fileset.py
+++ b/python/scripts/archiveorg_fileset.py
@@ -16,32 +16,31 @@ from typing import Any
import internetarchive
FORMAT_TO_MIMETYPE = {
- 'BZIP': 'application/x-bzip',
- 'BZIP2': 'application/x-bzip2',
- 'ZIP': 'application/zip',
- 'GZIP': 'application/gzip',
- 'RAR': 'application/vnd.rar',
- 'TAR': 'application/x-tar',
- '7z': 'application/x-7z-compressed',
- 'HTML': 'text/html',
- 'Text': 'text/plain',
- 'PDF': 'application/pdf',
- 'CSV': 'text/csv',
- 'XML': 'application/xml',
- 'JSON': 'application/json',
-
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
#'application/msword (.doc)', # .doc
#'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
#'application/vnd.ms-excel', # .xls
#'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
- 'MP3': 'audio/mpeg', # .mp3
- 'MP4': 'video/mp4', # .mp4
- 'MPEG': 'video/mpeg', # .mpeg
- 'JPEG': 'image/jpeg',
- 'GIF': 'image/gif',
- 'PNG': 'image/png',
- 'TIFF': 'image/tiff',
- 'Unknown': None,
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
}
@@ -49,22 +48,22 @@ def want_file(f: dict, item_name: str) -> bool:
"""
Filters IA API files
"""
- if f.source != 'original':
+ if f.source != "original":
return False
for suffix in [
- '_meta.sqlite',
- '_archive.torrent',
- '_itemimage.jpg',
- '_meta.xml',
- '_thumb.png',
- '_files.xml',
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
]:
if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
return False
- if f.name.startswith('_'):
+ if f.name.startswith("_"):
return False
- if item_name.startswith('academictorrents_'):
- for suffix in ['_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib']:
+ if item_name.startswith("academictorrents_"):
+ for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]:
if f.name == item_name + suffix:
return False
return True
@@ -77,36 +76,38 @@ def parse_file(f: dict) -> dict:
assert f.name and f.sha1 and f.md5
assert f.name is not None
mf = {
- 'path': f.name,
- 'size': int(f.size),
- 'sha1': f.sha1,
- 'md5': f.md5,
+ "path": f.name,
+ "size": int(f.size),
+ "sha1": f.sha1,
+ "md5": f.md5,
}
# TODO: will disable this hard check eventually and replace with:
- #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+ # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
mimetype = FORMAT_TO_MIMETYPE[f.format]
if mimetype:
- mf['extra'] = dict(mimetype=mimetype)
+ mf["extra"] = dict(mimetype=mimetype)
return mf
def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
- if release_id.startswith('release_'):
+ if release_id.startswith("release_"):
release_id = release_id[9:]
assert len(release_id) == 26
item = session.get_item(item_name)
- assert item.metadata['mediatype'] not in ['collection', 'web']
+ assert item.metadata["mediatype"] not in ["collection", "web"]
item_files = item.get_files(on_the_fly=False)
manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
fileset = {
- 'manifest': manifest,
- 'urls': [{
- 'rel': 'archive',
- 'url': f'https://archive.org/download/{item_name}/',
- }, ],
- 'release_ids': [release_id],
- #extra={},
+ "manifest": manifest,
+ "urls": [
+ {
+ "rel": "archive",
+ "url": f"https://archive.org/download/{item_name}/",
+ },
+ ],
+ "release_ids": [release_id],
+ # extra={},
}
print(json.dumps(fileset))
return fileset
@@ -123,12 +124,12 @@ def main():
line = line.strip()
if not line:
continue
- fields = line.split('\t')
+ fields = line.split("\t")
assert len(fields) == 2
item_name = fields[0]
release_id = fields[1]
item_to_fileset(item_name, release_id=release_id, session=session)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
index aa78aec..0b60da3 100755
--- a/python/scripts/cdx_collection.py
+++ b/python/scripts/cdx_collection.py
@@ -29,7 +29,7 @@ def run():
collection = sys.argv[1]
# Check collection name is clean
- assert collection.replace('_', '').replace('-', '').replace('.', '').isalnum()
+ assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum()
tempdir = tempfile.mkdtemp()
print("Looking up collection: {}".format(collection))
@@ -45,15 +45,17 @@ def run():
status = True
errors = []
for item in item_list:
- item = item['identifier']
+ item = item["identifier"]
# TODO: error handling
try:
- ret = ia.download(item,
- files=[item + '.cdx.gz'],
- verbose=True,
- destdir=tempdir,
- no_directory=True,
- retries=1000)
+ ret = ia.download(
+ item,
+ files=[item + ".cdx.gz"],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000,
+ )
status = ret and status
except requests.exceptions.ReadTimeout as rt:
print(str(rt), file=sys.stderr)
@@ -67,14 +69,14 @@ def run():
# Combine files
print("Merging and re-compressing all CDX files...")
- #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
- subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), shell=True)
+ # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+ subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True)
# Move and cleanup
- shutil.move('{}/combined.gz'.format(tempdir), '{}.cdx.gz'.format(collection))
+ shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection))
print("Done!")
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 4714b60..e3bf4f0 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -18,38 +18,44 @@ def canon(s):
def transform_cnki(obj):
requests = []
- assert obj['cnki_id']
+ assert obj["cnki_id"]
requests = []
- requests.append({
- 'base_url': canon(obj['info_url']),
- 'ingest_type': 'pdf',
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
- if 'read_url' in obj:
- requests.append({
- 'base_url': canon(obj['read_url']),
- 'ingest_type': 'pdf', # actually HTML
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
+ requests.append(
+ {
+ "base_url": canon(obj["info_url"]),
+ "ingest_type": "pdf",
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
+ if "read_url" in obj:
+ requests.append(
+ {
+ "base_url": canon(obj["read_url"]),
+ "ingest_type": "pdf", # actually HTML
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
return requests
def transform_wanfang(obj):
- assert obj['wanfang_id']
- return [{
- 'base_url': canon(obj['url']),
- 'ingest_type': 'pdf',
- 'link_source': 'wanfang_covid19',
- 'link_source_id': obj['wanfang_id'],
- 'ingest_request_source': 'scrape-covid19',
- }]
+ assert obj["wanfang_id"]
+ return [
+ {
+ "base_url": canon(obj["url"]),
+ "ingest_type": "pdf",
+ "link_source": "wanfang_covid19",
+ "link_source_id": obj["wanfang_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ ]
def run(args):
@@ -58,9 +64,9 @@ def run(args):
continue
row = json.loads(l)
- if 'wanfang_id' in row:
+ if "wanfang_id" in row:
requests = transform_wanfang(row) or []
- elif 'cnki_id' in row:
+ elif "cnki_id" in row:
requests = transform_cnki(row) or []
else:
continue
@@ -70,9 +76,9 @@ def run(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="COVID-19 metadata file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -80,5 +86,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 3b53235..3c769cf 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -45,38 +45,38 @@ def b32_hex(s):
s = s[5:]
if len(s) != 32:
return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
-class DeliverDumpGrobidS3():
+class DeliverDumpGrobidS3:
def __init__(self, s3_bucket, **kwargs):
self.rstore = None
self.count = Counter()
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
- self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
- self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "grobid/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml")
+ self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def run(self, dump_file):
sys.stderr.write("Starting...\n")
for line in dump_file:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, grobid_json = line[0], line[1]
if len(sha1_hex) != 40:
sha1_hex = b32_hex(sha1_hex)
assert len(sha1_hex) == 40
grobid = json.loads(grobid_json)
- tei_xml = grobid.get('tei_xml')
+ tei_xml = grobid.get("tei_xml")
if not tei_xml:
print("{}\tskip empty".format(sha1_hex))
- self.count['skip-empty'] += 1
+ self.count["skip-empty"] += 1
continue
- tei_xml = tei_xml.encode('utf-8')
+ tei_xml = tei_xml.encode("utf-8")
# upload to AWS S3
obj = self.bucket.put_object(
Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
@@ -84,7 +84,7 @@ class DeliverDumpGrobidS3():
StorageClass=self.s3_storage_class,
)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
@@ -92,31 +92,32 @@ class DeliverDumpGrobidS3():
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="grobid/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".tei.xml",
- help='file suffix for created objects')
- parser.add_argument('--s3-storage-class',
- type=str,
- default="STANDARD",
- help='AWS S3 storage class (redundancy) to use')
- parser.add_argument('dump_file',
- help="TSV/JSON dump file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix",
+ type=str,
+ default="grobid/",
+ help="key prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--s3-storage-class",
+ type=str,
+ default="STANDARD",
+ help="AWS S3 storage class (redundancy) to use",
+ )
+ parser.add_argument(
+ "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r")
+ )
args = parser.parse_args()
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
-if __name__ == '__main__': # pragma: no cover
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index ca19b97..fcaf51f 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -27,64 +27,76 @@ sentry_client = raven.Client()
class DeliverGwbDisk:
def __init__(self, disk_dir, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
- os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.disk_dir = disk_dir
- self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
- self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
+ self.disk_prefix = kwargs.get("disk_prefix", "pdf/")
+ self.disk_suffix = kwargs.get("disk_suffix", ".pdf")
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
self.rstore = ResourceStore(
- loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
)
except ValueError as ve:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".
- format(ve))
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".
- format(eofe))
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
return None, dict(
status="error",
- reason=
- "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
- .format(te))
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ return None, dict(
+ status="error",
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
return None, dict(
status="error",
- reason=
- "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
- format(ire))
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
@@ -95,47 +107,57 @@ class DeliverGwbDisk:
os.makedirs(fpath, exist_ok=True)
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
- file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
- status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# save to disk
- fpath = "{}/{}{}/{}/{}{}".format(self.disk_dir, self.disk_prefix, sha1_hex[0:2],
- sha1_hex[2:4], sha1_hex, self.disk_suffix)
- with open(fpath, 'wb') as f:
+ fpath = "{}/{}{}/{}/{}{}".format(
+ self.disk_dir,
+ self.disk_prefix,
+ sha1_hex[0:2],
+ sha1_hex[2:4],
+ sha1_hex,
+ self.disk_suffix,
+ )
+ with open(fpath, "wb") as f:
f.write(blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
- self.count['success-disk'] += 1
+ self.count["success-disk"] += 1
sys.stderr.write("{}\n".format(self.count))
@@ -143,31 +165,35 @@ class DeliverGwbDisk:
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--disk-dir',
- required=True,
- type=str,
- help='local base directory to save into')
- parser.add_argument('--disk-prefix',
- type=str,
- default="pdf/",
- help='directory prefix for items created in bucket')
- parser.add_argument('--disk-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created files')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--disk-dir", required=True, type=str, help="local base directory to save into"
+ )
+ parser.add_argument(
+ "--disk-prefix",
+ type=str,
+ default="pdf/",
+ help="directory prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--disk-suffix", type=str, default=".pdf", help="file suffix for created files"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index f9b3b19..1f08c4f 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -54,111 +54,128 @@ sentry_client = raven.Client()
class DeliverGwbS3:
def __init__(self, s3_bucket, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
- os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
- self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "pdf/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".pdf")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
self.rstore = ResourceStore(
- loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
)
except ValueError as ve:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".
- format(ve))
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".
- format(eofe))
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
return None, dict(
status="error",
- reason=
- "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
- .format(te))
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ return None, dict(
+ status="error",
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
return None, dict(
status="error",
- reason=
- "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
- format(ire))
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
- file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
- status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# upload to AWS S3
- obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4],
- sha1_hex, self.s3_suffix),
- Body=blob)
+ obj = self.bucket.put_object(
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
+ Body=blob,
+ )
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
@@ -166,31 +183,32 @@ class DeliverGwbS3:
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="pdf/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created objects')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket"
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index 84a2c2c..67286b9 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -17,23 +17,21 @@ import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
"ncbi.nlm.nih.gov/",
- #"semanticscholar.org/",
+ # "semanticscholar.org/",
"://doi.org/",
"zenodo.org/",
"figshare.com/",
"://archive.org/",
".archive.org/",
-
# large publishers/platforms; may remove in the future
- #"://link.springer.com/",
- #"://dergipark.gov.tr/",
- #"frontiersin.org/",
- #"scielo",
+ # "://link.springer.com/",
+ # "://dergipark.gov.tr/",
+ # "frontiersin.org/",
+ # "scielo",
]
# these default to PDF; note that we also do pdf ingests for HTML pages
@@ -63,35 +61,35 @@ def transform(obj: dict) -> List[dict]:
Returns a list of dicts.
"""
- doaj_id = obj['id'].lower()
+ doaj_id = obj["id"].lower()
assert doaj_id
- bibjson = obj['bibjson']
- if not bibjson['link']:
+ bibjson = obj["bibjson"]
+ if not bibjson["link"]:
return []
requests = []
doi: Optional[str] = None
- for ident in (bibjson['identifier'] or []):
- if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'):
- doi = ident['id'].lower()
+ for ident in bibjson["identifier"] or []:
+ if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
+ doi = ident["id"].lower()
- for link in (bibjson['link'] or []):
- if link.get('type') != "fulltext" or not link.get('url'):
+ for link in bibjson["link"] or []:
+ if link.get("type") != "fulltext" or not link.get("url"):
continue
- ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
+ ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
if not ingest_types:
continue
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in link['url'].lower():
+ if domain in link["url"].lower():
skip = True
if skip:
continue
try:
- base_url = canon(link['url'].strip())
+ base_url = canon(link["url"].strip())
except UnicodeEncodeError:
continue
@@ -100,18 +98,18 @@ def transform(obj: dict) -> List[dict]:
for ingest_type in ingest_types:
request = {
- 'base_url': base_url,
- 'ingest_type': ingest_type,
- 'link_source': 'doaj',
- 'link_source_id': doaj_id,
- 'ingest_request_source': 'doaj',
- 'release_stage': 'published',
- 'rel': 'publisher',
- 'ext_ids': {
- 'doi': doi,
- 'doaj': doaj_id,
+ "base_url": base_url,
+ "ingest_type": ingest_type,
+ "link_source": "doaj",
+ "link_source_id": doaj_id,
+ "ingest_request_source": "doaj",
+ "release_stage": "published",
+ "rel": "publisher",
+ "ext_ids": {
+ "doi": doi,
+ "doaj": doaj_id,
},
- 'edit_extra': {},
+ "edit_extra": {},
}
requests.append(request)
@@ -131,9 +129,9 @@ def run(args) -> None:
def main() -> None:
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="DOAJ article dump file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -141,5 +139,5 @@ def main() -> None:
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 54c3d5f..44c091c 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
@@ -24,23 +24,25 @@ import sys
def run():
for line in sys.stdin:
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 5
- raw_sha1 = line[0].replace('sha1:', '')
+ raw_sha1 = line[0].replace("sha1:", "")
dois = json.loads(line[1])
cdx = json.loads(line[2])
mimetype = line[3]
size = int(line[4])
- sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
+ sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower()
- obj = dict(sha1=sha1,
- dois=dois,
- cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
- size=size,
- mimetype=mimetype)
+ obj = dict(
+ sha1=sha1,
+ dois=dois,
+ cdx=[dict(url=cdx["url"], dt=cdx["dt"])],
+ size=size,
+ mimetype=mimetype,
+ )
print(json.dumps(obj))
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index a474393..8fce0d9 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -3,39 +3,41 @@
import json
import sys
-with open('title_slug_denylist.txt', 'r') as f:
+with open("title_slug_denylist.txt", "r") as f:
TITLE_DENYLIST = [l.strip() for l in f]
-TITLE_DENYLIST.extend((
- 'editorial',
- 'advertisement',
- 'bookreviews',
- 'reviews',
- 'nr',
- 'abstractoriginalarticle',
- 'originalarticle',
- 'impactfactor',
- 'articlenumber',
-))
+TITLE_DENYLIST.extend(
+ (
+ "editorial",
+ "advertisement",
+ "bookreviews",
+ "reviews",
+ "nr",
+ "abstractoriginalarticle",
+ "originalarticle",
+ "impactfactor",
+ "articlenumber",
+ )
+)
# The full name can't *entirely* be one of these
NAME_DENYLIST = (
- 'phd',
- 'phdstudent',
+ "phd",
+ "phdstudent",
)
def tokenize(s, remove_whitespace=True):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+ return s.encode("ascii", "replace").decode("utf8").replace("?", "")
assert tokenize("Impact Factor: 2.114") == "impactfactor"
@@ -50,14 +52,14 @@ def filter_title(title):
title_slug = tokenize(title, remove_whitespace=True)
if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
return None
- if title_slug.startswith('nr'):
+ if title_slug.startswith("nr"):
return None
- if title.lower().replace('.', '').startswith('int j '):
+ if title.lower().replace(".", "").startswith("int j "):
return None
for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
if title.startswith(prefix):
- title.replace(prefix, '')
+ title.replace(prefix, "")
if title.startswith("The Journal of "):
return None
@@ -81,17 +83,17 @@ def filter_title(title):
return None
# too deep subtitling/splitting
- if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+ if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1:
return None
return title
def filter_author_name(name):
- name = name['name']
- if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
+ name = name["name"]
+ if name.strip().lower().replace(" ", "") in NAME_DENYLIST:
return None
- return ' '.join([t for t in name.split() if tokenize(t)])
+ return " ".join([t for t in name.split() if tokenize(t)])
def filter_authors(l):
@@ -107,45 +109,58 @@ def filter_journal_name(name):
# same denylist, for now
if not name:
return None
- name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+ name = name.replace(" e-ISSN", "").replace(" p-ISSN", "")
slug_name = tokenize(name)
if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
return None
- for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ",
- "Research Article ", "Available online www.jocpr.com "):
+ for prefix in (
+ "/ ",
+ "~ ",
+ "& ",
+ "© ",
+ "Original Research Article ",
+ "Original Article ",
+ "Research Article ",
+ "Available online www.jocpr.com ",
+ ):
if name.startswith(prefix):
- name = name.replace(prefix, '')
- for suffix in (" Available online at www.sciarena.com", " Original Article",
- " Available online at", " ISSN", " ISSUE"):
+ name = name.replace(prefix, "")
+ for suffix in (
+ " Available online at www.sciarena.com",
+ " Original Article",
+ " Available online at",
+ " ISSN",
+ " ISSUE",
+ ):
if name.endswith(suffix):
- name = name.replace(suffix, '')
+ name = name.replace(suffix, "")
if "====================" in name:
return None
if len(name) > 150:
return None
- return ' '.join(name.split())
+ return " ".join(name.split())
def filter_metadata(obj):
- if not (obj.get('title') and obj.get('authors')):
+ if not (obj.get("title") and obj.get("authors")):
return None
- title = filter_title(obj['title'])
+ title = filter_title(obj["title"])
if not title:
- #sys.stderr.write("bad title\n")
+ # sys.stderr.write("bad title\n")
return None
else:
- obj['title'] = title
- obj['authors'] = filter_authors(obj['authors'])
- obj['citations'] = filter_refs(obj['citations'])
- obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
+ obj["title"] = title
+ obj["authors"] = filter_authors(obj["authors"])
+ obj["citations"] = filter_refs(obj["citations"])
+ obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"])
return obj
def run(invert=False):
for line in sys.stdin:
- fields = line.split('\t')
+ fields = line.split("\t")
if len(fields) == 5:
raw = fields[4]
elif len(fields) == 1:
@@ -162,7 +177,7 @@ def run(invert=False):
fields[4] = processed
else:
fields[0] = processed
- print('\t'.join(fields))
+ print("\t".join(fields))
elif invert:
print(raw.strip())
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index fda9098..87dae16 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -31,15 +31,15 @@ REQUIRE_AUTHORS = False
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
def check_authors(left, right):
@@ -53,7 +53,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -61,21 +61,21 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
def test_check_authors():
assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
# Rows are (score, left, right)
@@ -90,10 +90,10 @@ def process_group(rows):
left = json.loads(row[1])
right = json.loads(row[2])
# authors must roughly match
- if not check_authors(left['authors'], right['authors']):
+ if not check_authors(left["authors"], right["authors"]):
continue
# years must match (if defined)
- if left['year'] and right['year'] and left['year'] != right['year']:
+ if left["year"] and right["year"] and left["year"] != right["year"]:
continue
filtered.append((left, right))
@@ -105,8 +105,8 @@ def process_group(rows):
group_ids = set()
for row in filtered[1:]:
(left, right) = row
- l_id = left['fatcat_release']
- r_id = right['fatcat_release']
+ l_id = left["fatcat_release"]
+ r_id = right["fatcat_release"]
releases[l_id] = left
releases[r_id] = right
if not group_ids:
@@ -131,7 +131,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -146,5 +146,5 @@ def run():
process_group(lines)
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index 3251852..c5b7eef 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -23,15 +23,15 @@ require_authors = 1
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
def check_authors(left, right):
@@ -45,7 +45,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -53,21 +53,21 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
def test_check_authors():
assert not check_authors([], [])
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
# Rows are (score, grobid, crossref)
@@ -81,14 +81,14 @@ def process_group(rows):
continue
grobid = json.loads(row[1])
crossref = json.loads(row[2])
- if not check_authors(crossref['authors'], grobid['authors']):
- #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
+ if not check_authors(crossref["authors"], grobid["authors"]):
+ # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
continue
else:
- #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
+ # print("YES: {} {}".format(crossref['authors'], grobid['authors']))
pass
- sha1 = grobid['sha1']
- doi = crossref['doi'].lower()
+ sha1 = grobid["sha1"]
+ doi = crossref["doi"].lower()
l = keepers.get(sha1, list())
l.append(doi)
keepers[sha1] = l
@@ -103,7 +103,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -117,5 +117,5 @@ def run():
process_group(lines)
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index b42153c..b01e46a 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -16,40 +16,40 @@ from grobid2json import teixml2json
def parse_hbase(line):
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 2
sha1hex = line[0]
obj = json.loads(line[1])
- tei_xml = obj['tei_xml']
+ tei_xml = obj["tei_xml"]
return sha1hex, tei_xml
def parse_pg(line):
obj = json.loads(line)
- return obj['sha1hex'], obj['tei_xml']
+ return obj["sha1hex"], obj["tei_xml"]
-def run(mode='hbase'):
+def run(mode="hbase"):
for line in sys.stdin:
- if mode == 'hbase':
+ if mode == "hbase":
sha1hex, tei_xml = parse_hbase(line)
- elif mode == 'pg':
+ elif mode == "pg":
sha1hex, tei_xml = parse_pg(line)
else:
- raise NotImplementedError('parse mode: {}'.format(mode))
+ raise NotImplementedError("parse mode: {}".format(mode))
obj = teixml2json(tei_xml, encumbered=False)
affiliations = []
- for author in obj['authors']:
- if author.get('affiliation'):
- affiliations.append(author['affiliation'])
+ for author in obj["authors"]:
+ if author.get("affiliation"):
+ affiliations.append(author["affiliation"])
if affiliations:
# don't duplicate affiliations; only the unique ones
affiliations = list(set([json.dumps(a) for a in affiliations]))
affiliations = [json.loads(a) for a in affiliations]
- print('\t'.join([sha1hex, json.dumps(affiliations)]))
+ print("\t".join([sha1hex, json.dumps(affiliations)]))
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index c9bc134..f941881 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -9,59 +9,59 @@ MAX_ABSTRACT_BYTES = 4096
def parse_grobid_json(obj):
- if not obj.get('title'):
+ if not obj.get("title"):
return None
extra = dict()
- if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
- abobj = dict(mimetype="text/plain", language=None, content=obj.get('abstract').strip())
+ if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES:
+ abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())
abstracts = [abobj]
else:
abstracts = None
contribs = []
- for a in obj.get('authors', []):
+ for a in obj.get("authors", []):
c = dict(raw_name=a, role="author")
contribs.append(c)
refs = []
- for raw in obj.get('citations', []):
+ for raw in obj.get("citations", []):
extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
- if raw.get('title'):
- ref['title'] = raw['title'].strip()
- if raw.get('date'):
+ ref["key"] = raw.get("id")
+ if raw.get("title"):
+ ref["title"] = raw["title"].strip()
+ if raw.get("date"):
try:
- year = int(raw['date'].strip()[:4])
- ref['year'] = year
+ year = int(raw["date"].strip()[:4])
+ ref["year"] = year
except:
pass
- for key in ('volume', 'url', 'issue', 'publisher'):
+ for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
extra[key] = raw[key].strip()
- if raw.get('authors'):
- extra['authors'] = [a['name'] for a in raw['authors']]
+ if raw.get("authors"):
+ extra["authors"] = [a["name"] for a in raw["authors"]]
if extra:
extra = dict(grobid=extra)
else:
extra = None
- ref['extra'] = extra
+ ref["extra"] = extra
refs.append(ref)
release_type = "journal-article"
release_date = None
- if obj.get('date'):
+ if obj.get("date"):
# TODO: only returns year, ever? how to handle?
- release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+ release_date = datetime.datetime(year=obj["date"], month=1, day=1)
- if obj.get('doi'):
- extra['doi'] = obj['doi'].lower()
- if obj['journal'].get('name'):
- extra['container_name'] = obj['journal']['name']
+ if obj.get("doi"):
+ extra["doi"] = obj["doi"].lower()
+ if obj["journal"].get("name"):
+ extra["container_name"] = obj["journal"]["name"]
- extra['is_longtail_oa'] = True
+ extra["is_longtail_oa"] = True
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -70,15 +70,17 @@ def parse_grobid_json(obj):
else:
extra = None
- return dict(title=obj['title'].strip(),
- contribs=contribs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
- abstracts=abstracts,
- release_type=release_type,
- release_date=release_date,
- extra=extra)
+ return dict(
+ title=obj["title"].strip(),
+ contribs=contribs,
+ publisher=obj["journal"].get("publisher"),
+ volume=obj["journal"].get("volume"),
+ issue=obj["journal"].get("issue"),
+ abstracts=abstracts,
+ release_type=release_type,
+ release_date=release_date,
+ extra=extra,
+ )
def run():
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 70731d5..d52e793 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -15,13 +15,13 @@ def transform(row):
"""
dict-to-dict
"""
- row.pop('created', None)
- extra = row.pop('request', None) or {}
- for k in ('ext_ids', 'edit_extra'):
+ row.pop("created", None)
+ extra = row.pop("request", None) or {}
+ for k in ("ext_ids", "edit_extra"):
if k in extra:
row[k] = extra[k]
- if 'release_ident' in extra:
- row['fatcat'] = dict(release_ident=extra['release_ident'])
+ if "release_ident" in extra:
+ row["fatcat"] = dict(release_ident=extra["release_ident"])
return row
@@ -38,9 +38,9 @@ def run(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file", help="arabesque output file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -48,5 +48,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 1f4a19f..9607b85 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -13,7 +13,6 @@ import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -26,17 +25,16 @@ DOMAIN_BLOCKLIST = [
"://archive.org/",
".archive.org/",
"://127.0.0.1/",
-
# OAI specific additions
"://hdl.handle.net/",
]
RELEASE_STAGE_MAP = {
- 'info:eu-repo/semantics/draftVersion': 'draft',
- 'info:eu-repo/semantics/submittedVersion': 'submitted',
- 'info:eu-repo/semantics/acceptedVersion': 'accepted',
- 'info:eu-repo/semantics/publishedVersion': 'published',
- 'info:eu-repo/semantics/updatedVersion': 'updated',
+ "info:eu-repo/semantics/draftVersion": "draft",
+ "info:eu-repo/semantics/submittedVersion": "submitted",
+ "info:eu-repo/semantics/acceptedVersion": "accepted",
+ "info:eu-repo/semantics/publishedVersion": "published",
+ "info:eu-repo/semantics/updatedVersion": "updated",
}
@@ -52,38 +50,38 @@ def transform(obj):
"""
requests = []
- if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+ if not obj.get("oai") or not obj["oai"].startswith("oai:"):
return []
- if not obj.get('urls'):
+ if not obj.get("urls"):
return []
# look in obj['formats'] for PDF?
- if obj.get('formats'):
+ if obj.get("formats"):
# if there is a list of formats, and it does not contain PDF, then
# skip. Note that we will continue if there is no formats list.
has_pdf = False
- for f in obj['formats']:
- if 'pdf' in f.lower():
+ for f in obj["formats"]:
+ if "pdf" in f.lower():
has_pdf = True
if not has_pdf:
return []
doi = None
- if obj.get('doi'):
- doi = obj['doi'][0].lower().strip()
- if not doi.startswith('10.'):
+ if obj.get("doi"):
+ doi = obj["doi"][0].lower().strip()
+ if not doi.startswith("10."):
doi = None
# infer release stage and/or type from obj['types']
release_stage = None
- for t in obj.get('types', []):
+ for t in obj.get("types", []):
if t in RELEASE_STAGE_MAP:
release_stage = RELEASE_STAGE_MAP[t]
# TODO: infer rel somehow? Eg, repository vs. OJS publisher
rel = None
- for url in obj['urls']:
+ for url in obj["urls"]:
skip = False
for domain in DOMAIN_BLOCKLIST:
if domain in url:
@@ -96,18 +94,18 @@ def transform(obj):
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'oai',
- 'link_source_id': obj['oai'].lower(),
- 'ingest_request_source': 'metha-bulk',
- 'release_stage': release_stage,
- 'rel': rel,
- 'ext_ids': {
- 'doi': doi,
- 'oai': obj['oai'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "oai",
+ "link_source_id": obj["oai"].lower(),
+ "ingest_request_source": "metha-bulk",
+ "release_stage": release_stage,
+ "rel": rel,
+ "ext_ids": {
+ "doi": doi,
+ "oai": obj["oai"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
requests.append(request)
@@ -127,9 +125,11 @@ def run(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="OAI-PMH dump file to use (usually stdin)",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file",
+ help="OAI-PMH dump file to use (usually stdin)",
+ type=argparse.FileType("r"),
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -137,5 +137,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index 3f81b3b..8b57c5b 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -22,15 +22,16 @@ def run(inpath, outpath):
renderer = poppler.PageRenderer()
full_page = renderer.render_page(page)
- img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw',
- "BGRA", 0, 1)
+ img = Image.frombuffer(
+ "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1
+ )
img.thumbnail((180, 300), Image.BICUBIC)
- #img.thumbnail((360,600), Image.BICUBIC)
+ # img.thumbnail((360,600), Image.BICUBIC)
img.save(outpath)
- #img.save(outpath, quality=95)
+ # img.save(outpath, quality=95)
-if __name__ == '__main__':
+if __name__ == "__main__":
if len(sys.argv) != 3:
print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)
sys.exit(-1)
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index b79f316..ad5353b 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -11,7 +11,6 @@ import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -25,11 +24,11 @@ DOMAIN_BLOCKLIST = [
]
RELEASE_STAGE_MAP = {
- 'draftVersion': 'draft',
- 'submittedVersion': 'submitted',
- 'acceptedVersion': 'accepted',
- 'publishedVersion': 'published',
- 'updatedVersion': 'updated',
+ "draftVersion": "draft",
+ "submittedVersion": "submitted",
+ "acceptedVersion": "accepted",
+ "publishedVersion": "published",
+ "updatedVersion": "updated",
}
@@ -45,44 +44,44 @@ def transform(obj):
"""
requests = []
- if not obj['doi'].startswith('10.'):
+ if not obj["doi"].startswith("10."):
return requests
- if not obj['oa_locations']:
+ if not obj["oa_locations"]:
return requests
- for location in obj['oa_locations']:
- if not location['url_for_pdf']:
+ for location in obj["oa_locations"]:
+ if not location["url_for_pdf"]:
continue
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in location['url_for_pdf']:
+ if domain in location["url_for_pdf"]:
skip = True
if skip:
continue
try:
- base_url = canon(location['url_for_pdf'])
+ base_url = canon(location["url_for_pdf"])
except UnicodeEncodeError:
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'unpaywall',
- 'link_source_id': obj['doi'].lower(),
- 'ingest_request_source': 'unpaywall',
- 'release_stage': RELEASE_STAGE_MAP.get(location['version']),
- 'rel': location['host_type'],
- 'ext_ids': {
- 'doi': obj['doi'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "unpaywall",
+ "link_source_id": obj["doi"].lower(),
+ "ingest_request_source": "unpaywall",
+ "release_stage": RELEASE_STAGE_MAP.get(location["version"]),
+ "rel": location["host_type"],
+ "ext_ids": {
+ "doi": obj["doi"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
- if obj.get('oa_status'):
- request['edit_extra']['oa_status'] = obj['oa_status']
- if location.get('evidence'):
- request['edit_extra']['evidence'] = location['evidence']
- if location['pmh_id']:
- request['ext_ids']['pmh_id'] = location['pmh_id']
+ if obj.get("oa_status"):
+ request["edit_extra"]["oa_status"] = obj["oa_status"]
+ if location.get("evidence"):
+ request["edit_extra"]["evidence"] = location["evidence"]
+ if location["pmh_id"]:
+ request["ext_ids"]["pmh_id"] = location["pmh_id"]
requests.append(request)
return requests
@@ -101,9 +100,9 @@ def run(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="unpaywall dump file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file", help="unpaywall dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -111,5 +110,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()