Diffstat (limited to 'python')
59 files changed, 1582 insertions, 1225 deletions
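The hunks below are almost entirely whitespace and line-wrapping changes: long call expressions are re-wrapped with one argument per line aligned to the opening parenthesis, two blank lines are enforced between top-level functions, and multi-name imports are regrouped. This is consistent with an automated formatter pass, though the diff itself does not name the tool. A minimal sketch of the call-wrapping style the files converge on, reusing an argparse setup that appears in the diff:

    import argparse

    # Once a call no longer fits on one line, its arguments are split one per
    # line and aligned to the opening parenthesis (the style seen in the '+'
    # lines throughout this diff).
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--kafka-hosts',
                        default="localhost:9092",
                        help="list of Kafka brokers (host/port) to use")
    parser.add_argument('-j',
                        '--jobs',
                        default=8,
                        type=int,
                        help="parallelism for batch CPU jobs")
    args = parser.parse_args()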
diff --git a/python/grobid2json.py b/python/grobid2json.py index b4bfe2b..0d47f36 100755 --- a/python/grobid2json.py +++ b/python/grobid2json.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ NB: adapted to work as a library for PDF extraction. Will probably be re-written eventually to be correct, complete, and robust; this is just a @@ -76,9 +75,7 @@ def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]: def journal_info(elem: ET.Element) -> Dict[str, Any]: journal = dict() journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns)) - journal["publisher"] = elem.findtext( - ".//{%s}publicationStmt/{%s}publisher" % (ns, ns) - ) + journal["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns)) if journal["publisher"] == "": journal["publisher"] = None journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) @@ -145,9 +142,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: info["grobid_version"] = application_tag.attrib["version"].strip() info["grobid_timestamp"] = application_tag.attrib["when"].strip() info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns)) - info["authors"] = all_authors( - header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns)) - ) + info["authors"] = all_authors(header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))) info["journal"] = journal_info(header) date = header.find('.//{%s}date[@type="published"]' % ns) info["date"] = (date is not None) and date.attrib.get("when") @@ -207,8 +202,7 @@ def main() -> None: # pragma no cover json.dumps( teixml2json(content, encumbered=(not args.no_encumbered)), sort_keys=True, - ) - ) + )) if __name__ == "__main__": # pragma no cover diff --git a/python/grobid_tool.py b/python/grobid_tool.py index 0084330..4ba9540 100755 --- a/python/grobid_tool.py +++ b/python/grobid_tool.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ These are generally for running one-off tasks from the command line. Output might go to stdout, or might go to Kafka topic. 
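For context on the grobid2json.py hunks above: teixml2json() takes TEI-XML produced by GROBID and returns a plain dict with keys such as title, authors, journal, grobid_version, and date. A minimal usage sketch, assuming the script is importable as a module from the python/ directory and that the input file holds GROBID TEI-XML; the path is hypothetical, and the meaning of encumbered= is inferred from the --no-encumbered flag:

    import json

    from grobid2json import teixml2json

    with open("example.grobid.tei.xml", "r") as f:
        tei_xml = f.read()

    # encumbered=False mirrors the --no-encumbered CLI flag in main() above:
    # it omits the copyright-encumbered fulltext fields and keeps only the
    # bibliographic metadata.
    doc = teixml2json(tei_xml, encumbered=False)
    print(json.dumps(doc, sort_keys=True))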
@@ -30,6 +29,7 @@ def run_extract_json(args): pusher = JsonLinePusher(worker, args.json_file) pusher.run() + def run_extract_cdx(args): grobid_client = GrobidClient(host_url=args.grobid_host) wayback_client = WaybackClient() @@ -53,6 +53,7 @@ def run_extract_cdx(args): ) pusher.run() + def run_extract_zipfile(args): grobid_client = GrobidClient(host_url=args.grobid_host) if args.jobs > 1: @@ -65,6 +66,7 @@ def run_extract_zipfile(args): pusher = ZipfilePusher(worker, args.zip_file) pusher.run() + def run_transform(args): grobid_client = GrobidClient() for line in args.json_file: @@ -82,52 +84,54 @@ def run_transform(args): def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--kafka-mode', - action='store_true', - help="send output to Kafka (not stdout)") + action='store_true', + help="send output to Kafka (not stdout)") parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") + default="localhost:9092", + help="list of Kafka brokers (host/port) to use") parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('-j', '--jobs', - default=8, type=int, - help="parallelism for batch CPU jobs") + default="dev", + help="Kafka topic namespace to use (eg, prod, qa, dev)") + parser.add_argument('-j', + '--jobs', + default=8, + type=int, + help="parallelism for batch CPU jobs") parser.add_argument('--grobid-host', - default="http://grobid.qa.fatcat.wiki", - help="GROBID API host/port") + default="http://grobid.qa.fatcat.wiki", + help="GROBID API host/port") subparsers = parser.add_subparsers() - sub_extract_json = subparsers.add_parser('extract-json', + sub_extract_json = subparsers.add_parser( + 'extract-json', help="for each JSON line with CDX info, fetches PDF and does GROBID extraction") sub_extract_json.set_defaults(func=run_extract_json) sub_extract_json.add_argument('json_file', - help="JSON file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="JSON file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_extract_cdx = subparsers.add_parser('extract-cdx', - help="for each CDX line, fetches PDF and does GROBID extraction") + sub_extract_cdx = subparsers.add_parser( + 'extract-cdx', help="for each CDX line, fetches PDF and does GROBID extraction") sub_extract_cdx.set_defaults(func=run_extract_cdx) sub_extract_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="CDX file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_extract_zipfile = subparsers.add_parser('extract-zipfile', + sub_extract_zipfile = subparsers.add_parser( + 'extract-zipfile', help="opens zipfile, iterates over PDF files inside and does GROBID extract for each") sub_extract_zipfile.set_defaults(func=run_extract_zipfile) - sub_extract_zipfile.add_argument('zip_file', - help="zipfile with PDFs to extract", - type=str) + sub_extract_zipfile.add_argument('zip_file', help="zipfile with PDFs to extract", type=str) sub_transform = subparsers.add_parser('transform') sub_transform.set_defaults(func=run_transform) sub_transform.add_argument('--metadata-only', - action='store_true', - help="Only pass through bibliographic metadata, not fulltext") - sub_transform.add_argument('json_file', + action='store_true', 
+ help="Only pass through bibliographic metadata, not fulltext") + sub_transform.add_argument( + 'json_file', help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field", type=argparse.FileType('r')) @@ -140,10 +144,10 @@ def main(): if args.kafka_mode: produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env) print("Running in kafka output mode, publishing to {}\n".format(produce_topic)) - args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, - produce_topic=produce_topic) + args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic) args.func(args) + if __name__ == '__main__': main() diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py index 137110c..c3d9c16 100755 --- a/python/ia_pdf_match.py +++ b/python/ia_pdf_match.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Input is IA item metadata JSON. Ouput is insertable fatcat "match" JSON @@ -81,21 +80,23 @@ def parse(obj): 'size': int(pdf_file['size']), 'mimetype': 'application/pdf', 'urls': [ - "https://archive.org/download/{}/{}".format( - obj['metadata']['identifier'], - pdf_file['name']), + "https://archive.org/download/{}/{}".format(obj['metadata']['identifier'], + pdf_file['name']), ], 'cdx': [], 'dois': [], } if extid_type == 'doi': - match['dois'] = [extid,] + match['dois'] = [ + extid, + ] else: match[extid_type] = extid return match + def run(): for line in sys.stdin: if not line: @@ -105,5 +106,6 @@ def run(): if match: print(json.dumps(match, sort_keys=True)) + if __name__ == '__main__': run() diff --git a/python/ingest_tool.py b/python/ingest_tool.py index c0ef5aa..305c3a8 100755 --- a/python/ingest_tool.py +++ b/python/ingest_tool.py @@ -18,7 +18,9 @@ def run_single_ingest(args): ) if args.force_recrawl: request['force_recrawl'] = True - if request['ingest_type'] in ['dataset',]: + if request['ingest_type'] in [ + 'dataset', + ]: ingester = IngestFilesetWorker( try_spn2=not args.no_spn2, ingest_file_result_stdout=True, @@ -32,75 +34,71 @@ def run_single_ingest(args): print(json.dumps(result, sort_keys=True)) return result + def run_requests(args): # TODO: switch to using JsonLinePusher file_worker = IngestFileWorker( try_spn2=not args.no_spn2, html_quick_mode=args.html_quick_mode, ) - fileset_worker = IngestFilesetWorker( - try_spn2=not args.no_spn2, - ) + fileset_worker = IngestFilesetWorker(try_spn2=not args.no_spn2, ) for l in args.json_file: request = json.loads(l.strip()) - if request['ingest_type'] in ['dataset',]: + if request['ingest_type'] in [ + 'dataset', + ]: result = fileset_worker.process(request) else: result = file_worker.process(request) print(json.dumps(result, sort_keys=True)) + def run_api(args): port = 8083 print("Listening on localhost:{}".format(port)) server = HTTPServer(('', port), IngestFileRequestHandler) server.serve_forever() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) subparsers = parser.add_subparsers() - sub_single= subparsers.add_parser('single', - help="ingests a single base URL") + sub_single = subparsers.add_parser('single', help="ingests a single base URL") sub_single.set_defaults(func=run_single_ingest) sub_single.add_argument('ingest_type', - default="pdf", - help="type of ingest (pdf, html, etc)") + default="pdf", + help="type of ingest (pdf, html, etc)") sub_single.add_argument('--release-id', - help="(optional) existing release ident to match to") - 
sub_single.add_argument('--doi', - help="(optional) existing release DOI to match to") + help="(optional) existing release ident to match to") + sub_single.add_argument('--doi', help="(optional) existing release DOI to match to") sub_single.add_argument('--force-recrawl', - action='store_true', - help="ignore GWB history and use SPNv2 to re-crawl") - sub_single.add_argument('--no-spn2', - action='store_true', - help="don't use live web (SPNv2)") + action='store_true', + help="ignore GWB history and use SPNv2 to re-crawl") + sub_single.add_argument('--no-spn2', action='store_true', help="don't use live web (SPNv2)") sub_single.add_argument('--html-quick-mode', - action='store_true', - help="don't fetch individual sub-resources, just use CDX") - sub_single.add_argument('url', - help="URL of paper to fetch") + action='store_true', + help="don't fetch individual sub-resources, just use CDX") + sub_single.add_argument('url', help="URL of paper to fetch") - sub_requests = subparsers.add_parser('requests', - help="takes a series of ingest requests (JSON, per line) and runs each") + sub_requests = subparsers.add_parser( + 'requests', help="takes a series of ingest requests (JSON, per line) and runs each") sub_requests.add_argument('--no-spn2', - action='store_true', - help="don't use live web (SPNv2)") + action='store_true', + help="don't use live web (SPNv2)") sub_requests.add_argument('--html-quick-mode', - action='store_true', - help="don't fetch individual sub-resources, just use CDX") + action='store_true', + help="don't fetch individual sub-resources, just use CDX") sub_requests.set_defaults(func=run_requests) sub_requests.add_argument('json_file', - help="JSON file (request per line) to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) + help="JSON file (request per line) to import from (or stdin)", + default=sys.stdin, + type=argparse.FileType('r')) - sub_api = subparsers.add_parser('api', - help="starts a simple HTTP server that processes ingest requests") + sub_api = subparsers.add_parser( + 'api', help="starts a simple HTTP server that processes ingest requests") sub_api.set_defaults(func=run_api) - sub_api.add_argument('--port', - help="HTTP port to listen on", - default=8033, type=int) + sub_api.add_argument('--port', help="HTTP port to listen on", default=8033, type=int) args = parser.parse_args() if not args.__dict__.get("func"): @@ -109,5 +107,6 @@ def main(): args.func(args) + if __name__ == '__main__': main() diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py index 89ecf1c..717b743 100755 --- a/python/pdfextract_tool.py +++ b/python/pdfextract_tool.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode """ @@ -20,10 +19,13 @@ def run_extract_json(args): multi_worker = MultiprocessWrapper(worker, args.sink) pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs) else: - worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink) + worker = PdfExtractWorker(wayback_client, + sink=args.sink, + thumbnail_sink=args.thumbnail_sink) pusher = JsonLinePusher(worker, args.json_file) pusher.run() + def run_extract_cdx(args): wayback_client = WaybackClient() if args.jobs > 1: @@ -37,7 +39,9 @@ def run_extract_cdx(args): batch_size=args.jobs, ) else: - worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink) + worker = PdfExtractWorker(wayback_client, + sink=args.sink, + 
thumbnail_sink=args.thumbnail_sink) pusher = CdxLinePusher( worker, args.cdx_file, @@ -46,6 +50,7 @@ def run_extract_cdx(args): ) pusher.run() + def run_extract_zipfile(args): if args.jobs > 1: print("multi-processing: {}".format(args.jobs), file=sys.stderr) @@ -57,6 +62,7 @@ def run_extract_zipfile(args): pusher = ZipfilePusher(worker, args.zip_file) pusher.run() + def run_single(args): worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink) with open(args.pdf_file, 'rb') as pdf_file: @@ -67,51 +73,48 @@ def run_single(args): args.thumbnail_sink.finish() - def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--kafka-mode', - action='store_true', - help="send output to Kafka (not stdout)") + action='store_true', + help="send output to Kafka (not stdout)") parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") + default="localhost:9092", + help="list of Kafka brokers (host/port) to use") parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('-j', '--jobs', - default=8, type=int, - help="parallelism for batch CPU jobs") + default="dev", + help="Kafka topic namespace to use (eg, prod, qa, dev)") + parser.add_argument('-j', + '--jobs', + default=8, + type=int, + help="parallelism for batch CPU jobs") subparsers = parser.add_subparsers() - sub_extract_json = subparsers.add_parser('extract-json', + sub_extract_json = subparsers.add_parser( + 'extract-json', help="for each JSON line with CDX info, fetches PDF and does PDF extraction") sub_extract_json.set_defaults(func=run_extract_json) sub_extract_json.add_argument('json_file', - help="JSON file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="JSON file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_extract_cdx = subparsers.add_parser('extract-cdx', - help="for each CDX line, fetches PDF and does PDF extraction") + sub_extract_cdx = subparsers.add_parser( + 'extract-cdx', help="for each CDX line, fetches PDF and does PDF extraction") sub_extract_cdx.set_defaults(func=run_extract_cdx) sub_extract_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="CDX file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_extract_zipfile = subparsers.add_parser('extract-zipfile', + sub_extract_zipfile = subparsers.add_parser( + 'extract-zipfile', help="opens zipfile, iterates over PDF files inside and does PDF extract for each") sub_extract_zipfile.set_defaults(func=run_extract_zipfile) - sub_extract_zipfile.add_argument('zip_file', - help="zipfile with PDFs to extract", - type=str) + sub_extract_zipfile.add_argument('zip_file', help="zipfile with PDFs to extract", type=str) - sub_single = subparsers.add_parser('single', - help="opens single PDF and extracts it") + sub_single = subparsers.add_parser('single', help="opens single PDF and extracts it") sub_single.set_defaults(func=run_single) - sub_single.add_argument('pdf_file', - help="single PDF to extract", - type=str) + sub_single.add_argument('pdf_file', help="single PDF to extract", type=str) args = parser.parse_args() if not args.__dict__.get("func"): @@ -123,17 +126,18 @@ def main(): if args.kafka_mode: text_topic = 
"sandcrawler-{}.pdf-text".format(args.kafka_env) thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env) - args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, - produce_topic=text_topic) + args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=text_topic) args.thumbnail_sink = KafkaSink(kafka_hosts=args.kafka_hosts, - produce_topic=thumbnail_topic) + produce_topic=thumbnail_topic) print("Running in kafka output mode, publishing to {} and {}\n".format( - text_topic, thumbnail_topic), file=sys.stderr) + text_topic, thumbnail_topic), + file=sys.stderr) else: args.sink = None args.thumbnail_sink = None args.func(args) + if __name__ == '__main__': main() diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py index e195bc7..9316313 100755 --- a/python/pdftrio_tool.py +++ b/python/pdftrio_tool.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Basically just a copy of grobid_tool.py, but for PDF classification instead of text extraction. @@ -21,19 +20,29 @@ def run_classify_pdf_json(args): pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host) wayback_client = WaybackClient() if args.jobs > 1: - worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode) + worker = PdfTrioWorker(pdftrio_client, + wayback_client, + sink=None, + mode=args.pdftrio_mode) multi_worker = MultiprocessWrapper(worker, args.sink) pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs) else: - worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode) + worker = PdfTrioWorker(pdftrio_client, + wayback_client, + sink=args.sink, + mode=args.pdftrio_mode) pusher = JsonLinePusher(worker, args.json_file) pusher.run() + def run_classify_pdf_cdx(args): pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host) wayback_client = WaybackClient() if args.jobs > 1: - worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode) + worker = PdfTrioWorker(pdftrio_client, + wayback_client, + sink=None, + mode=args.pdftrio_mode) multi_worker = MultiprocessWrapper(worker, args.sink) pusher = CdxLinePusher( multi_worker, @@ -43,7 +52,10 @@ def run_classify_pdf_cdx(args): batch_size=args.jobs, ) else: - worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode) + worker = PdfTrioWorker(pdftrio_client, + wayback_client, + sink=args.sink, + mode=args.pdftrio_mode) pusher = CdxLinePusher( worker, args.cdx_file, @@ -52,6 +64,7 @@ def run_classify_pdf_cdx(args): ) pusher.run() + def run_classify_pdf_zipfile(args): pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host) worker = PdfTrioBlobWorker(pdftrio_client, sink=args.sink, mode=args.pdftrio_mode) @@ -60,48 +73,53 @@ def run_classify_pdf_zipfile(args): def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--kafka-mode', - action='store_true', - help="send output to Kafka (not stdout)") + action='store_true', + help="send output to Kafka (not stdout)") parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") + default="localhost:9092", + help="list of Kafka brokers (host/port) to use") parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('-j', '--jobs', - default=8, type=int, - 
help="parallelism for batch CPU jobs") + default="dev", + help="Kafka topic namespace to use (eg, prod, qa, dev)") + parser.add_argument('-j', + '--jobs', + default=8, + type=int, + help="parallelism for batch CPU jobs") parser.add_argument('--pdftrio-host', - default="http://pdftrio.qa.fatcat.wiki", - help="pdftrio API host/port") + default="http://pdftrio.qa.fatcat.wiki", + help="pdftrio API host/port") parser.add_argument('--pdftrio-mode', - default="auto", - help="which classification mode to use") + default="auto", + help="which classification mode to use") subparsers = parser.add_subparsers() - sub_classify_pdf_json = subparsers.add_parser('classify-pdf-json', + sub_classify_pdf_json = subparsers.add_parser( + 'classify-pdf-json', help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion") sub_classify_pdf_json.set_defaults(func=run_classify_pdf_json) sub_classify_pdf_json.add_argument('json_file', - help="JSON file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="JSON file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_classify_pdf_cdx = subparsers.add_parser('classify-pdf-cdx', + sub_classify_pdf_cdx = subparsers.add_parser( + 'classify-pdf-cdx', help="for each CDX line, fetches PDF and does pdftrio classify_pdfion") sub_classify_pdf_cdx.set_defaults(func=run_classify_pdf_cdx) sub_classify_pdf_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="CDX file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_classify_pdf_zipfile = subparsers.add_parser('classify-pdf-zipfile', - help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each") + sub_classify_pdf_zipfile = subparsers.add_parser( + 'classify-pdf-zipfile', + help= + "opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each") sub_classify_pdf_zipfile.set_defaults(func=run_classify_pdf_zipfile) sub_classify_pdf_zipfile.add_argument('zip_file', - help="zipfile with PDFs to classify", - type=str) + help="zipfile with PDFs to classify", + type=str) args = parser.parse_args() if not args.__dict__.get("func"): @@ -112,10 +130,10 @@ def main(): if args.kafka_mode: produce_topic = "sandcrawler-{}.pdftrio-output".format(args.kafka_env) print("Running in kafka output mode, publishing to {}\n".format(produce_topic)) - args.sink = KafkaSink(kafka_hosts=args.kafka_hosts, - produce_topic=produce_topic) + args.sink = KafkaSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic) args.func(args) + if __name__ == '__main__': main() diff --git a/python/persist_tool.py b/python/persist_tool.py index d52f7c1..9160db6 100755 --- a/python/persist_tool.py +++ b/python/persist_tool.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs). @@ -16,9 +15,7 @@ from sandcrawler.persist import * def run_cdx(args): - worker = PersistCdxWorker( - db_url=args.db_url, - ) + worker = PersistCdxWorker(db_url=args.db_url, ) filter_mimetypes = ['application/pdf'] if args.no_mimetype_filter: filter_mimetypes = None @@ -32,6 +29,7 @@ def run_cdx(args): ) pusher.run() + def run_grobid(args): worker = PersistGrobidWorker( db_url=args.db_url, @@ -49,24 +47,22 @@ def run_grobid(args): ) pusher.run() + def run_grobid_disk(args): """ Writes XML to individual files on disk, and also prints non-XML metadata to stdout as JSON, which can be redirected to a separate file. 
""" - worker = PersistGrobidDiskWorker( - output_dir=args.output_dir, - ) + worker = PersistGrobidDiskWorker(output_dir=args.output_dir, ) pusher = JsonLinePusher( worker, args.json_file, ) pusher.run() + def run_pdftrio(args): - worker = PersistPdfTrioWorker( - db_url=args.db_url, - ) + worker = PersistPdfTrioWorker(db_url=args.db_url, ) pusher = JsonLinePusher( worker, args.json_file, @@ -74,6 +70,7 @@ def run_pdftrio(args): ) pusher.run() + def run_pdftext(args): worker = PersistPdfTextWorker( db_url=args.db_url, @@ -91,10 +88,9 @@ def run_pdftext(args): ) pusher.run() + def run_ingest_file_result(args): - worker = PersistIngestFileResultWorker( - db_url=args.db_url, - ) + worker = PersistIngestFileResultWorker(db_url=args.db_url, ) pusher = JsonLinePusher( worker, args.json_file, @@ -102,10 +98,9 @@ def run_ingest_file_result(args): ) pusher.run() + def run_ingest_request(args): - worker = PersistIngestRequestWorker( - db_url=args.db_url, - ) + worker = PersistIngestRequestWorker(db_url=args.db_url, ) pusher = JsonLinePusher( worker, args.json_file, @@ -113,92 +108,95 @@ def run_ingest_request(args): ) pusher.run() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--db-url', - help="postgresql database connection string", - default="postgres:///sandcrawler") - parser.add_argument('--s3-url', - help="S3 (seaweedfs) backend URL", - default="localhost:9000") + help="postgresql database connection string", + default="postgres:///sandcrawler") + parser.add_argument('--s3-url', help="S3 (seaweedfs) backend URL", default="localhost:9000") parser.add_argument('--s3-access-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY')) + help="S3 (seaweedfs) credential", + default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') + or os.environ.get('MINIO_ACCESS_KEY')) parser.add_argument('--s3-secret-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_SECRET_KEY')) + help="S3 (seaweedfs) credential", + default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') + or os.environ.get('MINIO_SECRET_KEY')) parser.add_argument('--s3-bucket', - help="S3 (seaweedfs) bucket to persist into", - default="sandcrawler-dev") + help="S3 (seaweedfs) bucket to persist into", + default="sandcrawler-dev") subparsers = parser.add_subparsers() - sub_cdx = subparsers.add_parser('cdx', - help="backfill a CDX file into postgresql cdx table") + sub_cdx = subparsers.add_parser('cdx', help="backfill a CDX file into postgresql cdx table") sub_cdx.set_defaults(func=run_cdx) sub_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) - sub_cdx.add_argument('--no-mimetype-filter', + help="CDX file to import from (or '-' for stdin)", + type=argparse.FileType('r')) + sub_cdx.add_argument( + '--no-mimetype-filter', action='store_true', help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)") - sub_grobid = subparsers.add_parser('grobid', - help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)") + sub_grobid = subparsers.add_parser( + 'grobid', help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)") sub_grobid.set_defaults(func=run_grobid) sub_grobid.add_argument('json_file', - help="grobid file to 
import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="grobid file to import from (or '-' for stdin)", + type=argparse.FileType('r')) sub_grobid.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") - sub_grobid.add_argument('--db-only', + action='store_true', + help="only upload TEI-XML to S3 (don't write to database)") + sub_grobid.add_argument( + '--db-only', action='store_true', help="only write status to sandcrawler-db (don't save TEI-XML to S3)") - sub_pdftext = subparsers.add_parser('pdftext', + sub_pdftext = subparsers.add_parser( + 'pdftext', help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)") sub_pdftext.set_defaults(func=run_pdftext) sub_pdftext.add_argument('json_file', - help="pdftext file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="pdftext file to import from (or '-' for stdin)", + type=argparse.FileType('r')) sub_pdftext.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") - sub_pdftext.add_argument('--db-only', + action='store_true', + help="only upload TEI-XML to S3 (don't write to database)") + sub_pdftext.add_argument( + '--db-only', action='store_true', help="only write status to sandcrawler-db (don't save TEI-XML to S3)") sub_grobid_disk = subparsers.add_parser('grobid-disk', - help="dump GRBOID output to (local) files on disk") + help="dump GRBOID output to (local) files on disk") sub_grobid_disk.set_defaults(func=run_grobid_disk) sub_grobid_disk.add_argument('json_file', - help="grobid file to import from (or '-' for stdin)", - type=argparse.FileType('r')) - sub_grobid_disk.add_argument('output_dir', - help="base directory to output into", - type=str) + help="grobid file to import from (or '-' for stdin)", + type=argparse.FileType('r')) + sub_grobid_disk.add_argument('output_dir', help="base directory to output into", type=str) - sub_pdftrio = subparsers.add_parser('pdftrio', + sub_pdftrio = subparsers.add_parser( + 'pdftrio', help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)") sub_pdftrio.set_defaults(func=run_pdftrio) sub_pdftrio.add_argument('json_file', - help="pdftrio file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="pdftrio file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_ingest_file_result = subparsers.add_parser('ingest-file-result', - help="backfill a ingest_file_result JSON dump into postgresql") + sub_ingest_file_result = subparsers.add_parser( + 'ingest-file-result', help="backfill a ingest_file_result JSON dump into postgresql") sub_ingest_file_result.set_defaults(func=run_ingest_file_result) - sub_ingest_file_result.add_argument('json_file', + sub_ingest_file_result.add_argument( + 'json_file', help="ingest_file_result file to import from (or '-' for stdin)", type=argparse.FileType('r')) - sub_ingest_request = subparsers.add_parser('ingest-request', - help="backfill a ingest_request JSON dump into postgresql") + sub_ingest_request = subparsers.add_parser( + 'ingest-request', help="backfill a ingest_request JSON dump into postgresql") sub_ingest_request.set_defaults(func=run_ingest_request) sub_ingest_request.add_argument('json_file', - help="ingest_request to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="ingest_request to import from (or '-' for stdin)", + type=argparse.FileType('r')) args = parser.parse_args() if not args.__dict__.get("func"): @@ -207,5 
+205,6 @@ def main(): args.func(args) + if __name__ == '__main__': main() diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index bf2d92d..46735eb 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -1,14 +1,16 @@ - from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker -from .ia import (CdxApiClient, CdxApiError, CdxPartial, CdxRow, PetaboxError, ResourceResult, SavePageNowClient, - SavePageNowError, WarcResource, WaybackClient, WaybackContentError, WaybackError) +from .ia import (CdxApiClient, CdxApiError, CdxPartial, CdxRow, PetaboxError, ResourceResult, + SavePageNowClient, SavePageNowError, WarcResource, WaybackClient, + WaybackContentError, WaybackError) from .ingest_file import IngestFileWorker from .ingest_fileset import IngestFilesetWorker -from .misc import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_datetime, parse_cdx_line +from .misc import (b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, + parse_cdx_datetime, parse_cdx_line) from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker -from .persist import (PersistCdxWorker, PersistGrobidDiskWorker, PersistGrobidWorker, PersistIngestFileResultWorker, - PersistIngestRequestWorker, PersistPdfTextWorker, PersistPdfTrioWorker, PersistThumbnailWorker) -from .workers import (BlackholeSink, CdxLinePusher, JsonLinePusher, KafkaCompressSink, KafkaJsonPusher, KafkaSink, - MultiprocessWrapper, ZipfilePusher) +from .persist import (PersistCdxWorker, PersistGrobidDiskWorker, PersistGrobidWorker, + PersistIngestFileResultWorker, PersistIngestRequestWorker, + PersistPdfTextWorker, PersistPdfTrioWorker, PersistThumbnailWorker) +from .workers import (BlackholeSink, CdxLinePusher, JsonLinePusher, KafkaCompressSink, + KafkaJsonPusher, KafkaSink, MultiprocessWrapper, ZipfilePusher) diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index 4dcdb0e..360add9 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -1,4 +1,3 @@ - import datetime import json from typing import Optional @@ -9,17 +8,16 @@ import requests class SandcrawlerPostgrestClient: - def __init__(self, api_url="http://wbgrp-svc506.us.archive.org:3030", **kwargs): self.api_url = api_url def get_cdx(self, url): - resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.'+url)) + resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.' + url)) resp.raise_for_status() return resp.json() or None def get_grobid(self, sha1): - resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex='eq.'+sha1)) + resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex='eq.' + sha1)) resp.raise_for_status() resp = resp.json() if resp: @@ -28,7 +26,7 @@ class SandcrawlerPostgrestClient: return None def get_pdftrio(self, sha1): - resp = requests.get(self.api_url + "/pdftrio", params=dict(sha1hex='eq.'+sha1)) + resp = requests.get(self.api_url + "/pdftrio", params=dict(sha1hex='eq.' + sha1)) resp.raise_for_status() resp = resp.json() if resp: @@ -37,7 +35,7 @@ class SandcrawlerPostgrestClient: return None def get_pdf_meta(self, sha1): - resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex='eq.'+sha1)) + resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex='eq.' 
+ sha1)) resp.raise_for_status() resp = resp.json() if resp: @@ -58,7 +56,7 @@ class SandcrawlerPostgrestClient: return None def get_file_meta(self, sha1): - resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1)) + resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.' + sha1)) resp.raise_for_status() resp = resp.json() if resp: @@ -91,7 +89,7 @@ class SandcrawlerPostgrestClient: return None def get_crossref(self, doi): - resp = requests.get(self.api_url + "/crossref", params=dict(doi='eq.'+doi)) + resp = requests.get(self.api_url + "/crossref", params=dict(doi='eq.' + doi)) resp.raise_for_status() resp = resp.json() if resp: @@ -99,8 +97,8 @@ class SandcrawlerPostgrestClient: else: return None -class SandcrawlerPostgresClient: +class SandcrawlerPostgresClient: def __init__(self, db_url, **kwargs): self.conn = psycopg2.connect(db_url) @@ -135,14 +133,8 @@ class SandcrawlerPostgresClient: batch = [d for d in batch if d.get('warc_path')] if not batch: return (0, 0) - batch = [(d['url'], - d['datetime'], - d['sha1hex'], - d['mimetype'], - d['warc_path'], - int(d['warc_csize']), - int(d['warc_offset'])) - for d in batch] + batch = [(d['url'], d['datetime'], d['sha1hex'], d['mimetype'], d['warc_path'], + int(d['warc_csize']), int(d['warc_offset'])) for d in batch] # filter out duplicate rows by key (url, datetime) batch_dict = dict() for b in batch: @@ -170,12 +162,8 @@ class SandcrawlerPostgresClient: else: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - batch = [(d['sha1hex'], - d['sha256hex'], - d['md5hex'], - int(d['size_bytes']), - d['mimetype']) - for d in batch] + batch = [(d['sha1hex'], d['sha256hex'], d['md5hex'], int(d['size_bytes']), + d['mimetype']) for d in batch] # filter out duplicate rows by key (sha1hex) batch_dict = dict() for b in batch: @@ -215,15 +203,15 @@ class SandcrawlerPostgresClient: r[k] = r['metadata'].get(k) r['metadata'].pop(k, None) r['metadata'] = json.dumps(r['metadata'], sort_keys=True) - batch = [(d['key'], - d.get('grobid_version') or None, - d['status_code'], - d['status'], - d.get('fatcat_release') or None, - d.get('updated') or datetime.datetime.now(), - d.get('metadata') or None , - ) - for d in batch] + batch = [( + d['key'], + d.get('grobid_version') or None, + d['status_code'], + d['status'], + d.get('fatcat_release') or None, + d.get('updated') or datetime.datetime.now(), + d.get('metadata') or None, + ) for d in batch] # filter out duplicate rows by key (sha1hex) batch_dict = dict() for b in batch: @@ -331,20 +319,18 @@ class SandcrawlerPostgresClient: else: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - batch = [ - ( - d['key'], - d.get('updated') or datetime.datetime.now(), - d['status_code'], - d['status'], - d.get('versions', {}).get('pdftrio_version') or None, - d.get('versions', {}).get('models_date') or None, - d.get('ensemble_score'), - d.get('bert_score'), - d.get('linear_score'), - d.get('image_score'), - ) - for d in batch] + batch = [( + d['key'], + d.get('updated') or datetime.datetime.now(), + d['status_code'], + d['status'], + d.get('versions', {}).get('pdftrio_version') or None, + d.get('versions', {}).get('models_date') or None, + d.get('ensemble_score'), + d.get('bert_score'), + d.get('linear_score'), + d.get('image_score'), + ) for d in batch] # filter out duplicate rows by key (sha1hex) batch_dict = dict() for b in batch: @@ -373,15 +359,15 @@ class SandcrawlerPostgresClient: extra[k] = r[k] if 
extra: r['extra'] = json.dumps(extra, sort_keys=True) - batch = [(d['link_source'], - d['link_source_id'], - d['ingest_type'], - d['base_url'], - d.get('ingest_request_source'), - d.get('release_stage') or None, - d.get('extra') or None, - ) - for d in batch] + batch = [( + d['link_source'], + d['link_source_id'], + d['ingest_type'], + d['base_url'], + d.get('ingest_request_source'), + d.get('release_stage') or None, + d.get('extra') or None, + ) for d in batch] # filter out duplicate rows by key (link_source, link_source_id, ingest_type, base_url) batch_dict = dict() for b in batch: @@ -412,16 +398,16 @@ class SandcrawlerPostgresClient: else: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - batch = [(d['ingest_type'], - d['base_url'], - bool(d['hit']), - d['status'], - d.get('terminal_url'), - d.get('terminal_dt'), - d.get('terminal_status_code'), - d.get('terminal_sha1hex'), - ) - for d in batch] + batch = [( + d['ingest_type'], + d['base_url'], + bool(d['hit']), + d['status'], + d.get('terminal_url'), + d.get('terminal_dt'), + d.get('terminal_status_code'), + d.get('terminal_sha1hex'), + ) for d in batch] # filter out duplicate rows by key (ingest_type, base_url) batch_dict = dict() for b in batch: @@ -459,23 +445,23 @@ class SandcrawlerPostgresClient: else: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - batch = [(d['ingest_type'], - d['base_url'], - bool(d['hit']), - d['status'], - d.get('platform_name'), - d.get('platform_domain'), - d.get('platform_id'), - d.get('ingest_strategy'), - d.get('total_size'), - d.get('file_count'), - d.get('archiveorg_item_name'), - d.get('archiveorg_item_bundle_path'), - d.get('web_bundle_url'), - d.get('web_bundle_dt'), - d.get('manifest'), - ) - for d in batch] + batch = [( + d['ingest_type'], + d['base_url'], + bool(d['hit']), + d['status'], + d.get('platform_name'), + d.get('platform_domain'), + d.get('platform_id'), + d.get('ingest_strategy'), + d.get('total_size'), + d.get('file_count'), + d.get('archiveorg_item_name'), + d.get('archiveorg_item_bundle_path'), + d.get('web_bundle_url'), + d.get('web_bundle_dt'), + d.get('manifest'), + ) for d in batch] # filter out duplicate rows by key (ingest_type, base_url) batch_dict = dict() for b in batch: diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py index 92fed37..f3441c9 100644 --- a/python/sandcrawler/fileset_platforms.py +++ b/python/sandcrawler/fileset_platforms.py @@ -1,4 +1,3 @@ - import gzip import json import sys @@ -16,17 +15,18 @@ from sandcrawler.ia import ResourceResult class FilesetPlatformHelper(): - def __init__(self): self.platform_name = 'unknown' - def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> bool: """ Does this request look like it matches this platform? 
""" raise NotImplementedError() - def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) """ @@ -40,19 +40,18 @@ class FilesetPlatformHelper(): # XXX: while developing ArchiveorgFileset path #return IngestStrategy.ArchiveorgFileset if len(item.manifest) == 1: - if total_size < 64*1024*1024: + if total_size < 64 * 1024 * 1024: return IngestStrategy.WebFile else: return IngestStrategy.ArchiveorgFile else: - if largest_size < 64*1024*1024 and total_size < 128*1024*1024*1024: + if largest_size < 64 * 1024 * 1024 and total_size < 128 * 1024 * 1024 * 1024: return IngestStrategy.WebFileset else: return IngestStrategy.ArchiveorgFileset class DataverseHelper(FilesetPlatformHelper): - def __init__(self): super().__init__() self.platform_name = 'dataverse' @@ -122,8 +121,8 @@ class DataverseHelper(FilesetPlatformHelper): "file_id": file_id, } - - def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> bool: if resource and resource.terminal_url: url = resource.terminal_url else: @@ -146,7 +145,8 @@ class DataverseHelper(FilesetPlatformHelper): return True - def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) @@ -179,22 +179,27 @@ class DataverseHelper(FilesetPlatformHelper): if parsed_id['file_id']: # XXX: maybe we could support this? - raise PlatformScopeError(f"only entire dataverse datasets can be archived with this tool") + raise PlatformScopeError( + f"only entire dataverse datasets can be archived with this tool") # 1b. if we didn't get a version number from URL, fetch it from API if not dataset_version: - resp = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}") + resp = self.session.get( + f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}" + ) resp.raise_for_status() obj = resp.json() obj_latest = obj['data']['latestVersion'] dataset_version = f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}" # 2. 
API fetch - resp = self.session.get(f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}") + resp = self.session.get( + f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}" + ) resp.raise_for_status() obj = resp.json() - obj_latest= obj['data']['latestVersion'] + obj_latest = obj['data']['latestVersion'] assert dataset_version == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}" assert platform_id == obj_latest['datasetPersistentId'] @@ -212,15 +217,16 @@ class DataverseHelper(FilesetPlatformHelper): extra['version'] = row['version'] if 'description' in df: extra['description'] = df['description'] - manifest.append(FilesetManifestFile( - path=df.get('originalFileName') or df['filename'], - size=df.get('originalFileSize') or df['filesize'], - md5=df['md5'], - # NOTE: don't get: sha1, sha256 - mimetype=df['contentType'], - platform_url=platform_url, - extra=extra or None, - )) + manifest.append( + FilesetManifestFile( + path=df.get('originalFileName') or df['filename'], + size=df.get('originalFileSize') or df['filesize'], + md5=df['md5'], + # NOTE: don't get: sha1, sha256 + mimetype=df['contentType'], + platform_url=platform_url, + extra=extra or None, + )) platform_sub_id = platform_id.split('/')[-1] archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}" @@ -228,7 +234,8 @@ class DataverseHelper(FilesetPlatformHelper): # XXX: collection=platform_domain, collection="datasets", date=obj_latest['releaseTime'].split('T')[0], - source=f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}", + source= + f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}", ) if platform_id.startswith('doi:10.'): archiveorg_item_meta['doi'] = platform_id.replace('doi:', '') @@ -238,7 +245,8 @@ class DataverseHelper(FilesetPlatformHelper): elif block['typeName'] == 'depositor': archiveorg_item_meta['creator'] = block['value'] elif block['typeName'] == 'dsDescription': - archiveorg_item_meta['description'] = block['value'][0]['dsDescriptionValue']['value'] + archiveorg_item_meta['description'] = block['value'][0]['dsDescriptionValue'][ + 'value'] archiveorg_item_meta['description'] = archiveorg_item_meta.get('description', '') if obj_latest.get('termsOfUse'): @@ -252,11 +260,13 @@ class DataverseHelper(FilesetPlatformHelper): platform_id=platform_id, archiveorg_item_name=archiveorg_item_name, archiveorg_item_meta=archiveorg_item_meta, - web_bundle_url=f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original", + web_bundle_url= + f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original", # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files) extra=dict(version=dataset_version), ) + def test_parse_dataverse_persistentid(): valid = { @@ -322,8 +332,8 @@ def test_parse_dataverse_persistentid(): except ValueError: pass -class FigshareHelper(FilesetPlatformHelper): +class FigshareHelper(FilesetPlatformHelper): def __init__(self): super().__init__() self.platform_name = 'figshare' @@ -346,7 +356,9 @@ class FigshareHelper(FilesetPlatformHelper): raise ValueError(f"not a figshare URL: {path}") comp = comp[2:] - if comp[0] in ['dataset',]: + if comp[0] in [ + 'dataset', + ]: comp = comp[1:] if len(comp) == 3 and comp[1].isdigit() and comp[2].isdigit(): @@ -356,7 
+368,8 @@ class FigshareHelper(FilesetPlatformHelper): else: raise ValueError(f"couldn't find figshare identiier: {path}") - def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> bool: if resource and resource.terminal_url: url = resource.terminal_url @@ -381,7 +394,8 @@ class FigshareHelper(FilesetPlatformHelper): return False - def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) """ @@ -397,13 +411,15 @@ class FigshareHelper(FilesetPlatformHelper): (platform_id, dataset_version) = self.parse_figshare_url_path(components.path) assert platform_id.isdigit(), f"expected numeric: {platform_id}" - assert dataset_version and dataset_version.isdigit(), f"expected numeric: {dataset_version}" + assert dataset_version and dataset_version.isdigit( + ), f"expected numeric: {dataset_version}" # 1b. if we didn't get a version number from URL, fetch it from API # TODO: implement this code path # 2. API fetch - resp = self.session.get(f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}") + resp = self.session.get( + f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}") resp.raise_for_status() obj = resp.json() @@ -412,18 +428,21 @@ class FigshareHelper(FilesetPlatformHelper): if not obj['is_public']: raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}') if obj['is_embargoed']: - raise PlatformRestrictedError(f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})') + raise PlatformRestrictedError( + f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})' + ) manifest = [] for row in obj['files']: - manifest.append(FilesetManifestFile( - path=row['name'], - size=row['size'], - md5=row['computed_md5'], - # NOTE: don't get: sha1, sha256, mimetype - platform_url=row['download_url'], - #extra=dict(), - )) + manifest.append( + FilesetManifestFile( + path=row['name'], + size=row['size'], + md5=row['computed_md5'], + # NOTE: don't get: sha1, sha256, mimetype + platform_url=row['download_url'], + #extra=dict(), + )) assert not row.get('is_link_only') authors = [] @@ -451,18 +470,23 @@ class FigshareHelper(FilesetPlatformHelper): platform_id=platform_id, archiveorg_item_name=archiveorg_item_name, archiveorg_item_meta=archiveorg_item_meta, - web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}", + web_bundle_url= + f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}", # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files) extra=dict(version=dataset_version), ) + def test_parse_figshare_url_path(): valid = { - "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": ("8987858", "1"), - "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": ("8987858", None), + 
"/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": + ("8987858", "1"), + "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": + ("8987858", None), "/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"), - "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": ("12127176", "4"), + "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": + ("12127176", "4"), } invalid = [ @@ -479,14 +503,15 @@ def test_parse_figshare_url_path(): except ValueError: pass -class ZenodoHelper(FilesetPlatformHelper): +class ZenodoHelper(FilesetPlatformHelper): def __init__(self): super().__init__() self.platform_name = 'zenodo' self.session = requests.Session() - def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> bool: if resource and resource.terminal_url: url = resource.terminal_url @@ -499,7 +524,8 @@ class ZenodoHelper(FilesetPlatformHelper): return True return False - def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) """ @@ -535,12 +561,15 @@ class ZenodoHelper(FilesetPlatformHelper): assert obj['id'] == int(platform_id) work_id = obj['conceptrecid'] if work_id == obj['id']: - raise PlatformScopeError("got a work-level zenodo record, not a versioned record: {work_id}") + raise PlatformScopeError( + "got a work-level zenodo record, not a versioned record: {work_id}") zenodo_type = obj['metadata']['resource_type']['type'] if obj['metadata']['access_right'] != 'open': - raise PlatformRestrictedError("not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}") + raise PlatformRestrictedError( + "not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}" + ) manifest = [] for row in obj['files']: @@ -600,11 +629,9 @@ class ArchiveOrgHelper(FilesetPlatformHelper): 'RAR': 'application/vnd.rar', 'TAR': 'application/x-tar', '7z': 'application/x-7z-compressed', - 'HTML': 'text/html', 'Text': 'text/plain', 'PDF': 'application/pdf', - 'CSV': 'text/csv', 'XML': 'application/xml', 'JSON': 'application/json', @@ -613,17 +640,13 @@ class ArchiveOrgHelper(FilesetPlatformHelper): #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx #'application/vnd.ms-excel', # .xls #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx - - 'MP3': 'audio/mpeg', # .mp3 - - 'MP4': 'video/mp4', # .mp4 - 'MPEG': 'video/mpeg', # .mpeg - + 'MP3': 'audio/mpeg', # .mp3 + 'MP4': 'video/mp4', # .mp4 + 'MPEG': 'video/mpeg', # .mpeg 'JPEG': 'image/jpeg', 'GIF': 'image/gif', 'PNG': 'image/png', 'TIFF': 'image/tiff', - 'Unknown': None, } @@ -640,24 +663,27 @@ class ArchiveOrgHelper(FilesetPlatformHelper): if f.source != 'original': return False for suffix in [ - '_meta.sqlite', - '_archive.torrent', - '_itemimage.jpg', - '_meta.xml', - '_thumb.png', - '_files.xml', + '_meta.sqlite', + '_archive.torrent', + '_itemimage.jpg', + '_meta.xml', + 
'_thumb.png', + '_files.xml', ]: if f.name == item_name + suffix or f.name == item_name.lower() + suffix: return False if f.name.startswith('_'): return False if item_name.startswith('academictorrents_'): - for suffix in ['_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib']: + for suffix in [ + '_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib' + ]: if f.name == item_name + suffix: return False return True - def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> bool: if resource and resource.terminal_url: url = resource.terminal_url @@ -672,20 +698,23 @@ class ArchiveOrgHelper(FilesetPlatformHelper): return True return False - def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request(self, request: dict, resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) """ base_url_split = request['base_url'].split('/') #print(base_url_split, file=sys.stderr) - assert len(base_url_split) in [5,6] + assert len(base_url_split) in [5, 6] assert base_url_split[0] in ['http:', 'https:'] assert base_url_split[2] == 'archive.org' assert base_url_split[3] in ['details', 'download'] item_name = base_url_split[4] if len(base_url_split) == 6 and base_url_split[5]: - raise PlatformScopeError("got an archive.org file path, not download/details page; individual files not handled yet") + raise PlatformScopeError( + "got an archive.org file path, not download/details page; individual files not handled yet" + ) #print(f" archiveorg processing item={item_name}", file=sys.stderr) item = self.session.get_item(item_name) diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index c9f182c..6c25276 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -1,4 +1,3 @@ - import gzip import json import os @@ -10,15 +9,15 @@ from typing import Any, Dict, List, Optional, Tuple import internetarchive -from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, FilesetPlatformItem, IngestStrategy, - PlatformScopeError) +from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, + FilesetPlatformItem, IngestStrategy, PlatformScopeError) from sandcrawler.html_metadata import BiblioMetadata -from sandcrawler.ia import ResourceResult, SavePageNowClient, WaybackClient, fix_transfer_encoding +from sandcrawler.ia import (ResourceResult, SavePageNowClient, WaybackClient, + fix_transfer_encoding) from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path class FilesetIngestStrategy(): - def __init__(self): #self.ingest_strategy = 'unknown' self.success_status = "success" @@ -31,7 +30,6 @@ class FilesetIngestStrategy(): class ArchiveorgFilesetStrategy(FilesetIngestStrategy): - def __init__(self, **kwargs): super().__init__() self.ingest_strategy = IngestStrategy.ArchiveorgFileset @@ -61,7 +59,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): found = False for existing in item_files: if existing.name == wanted.path: - if ((existing.sha1 and existing.sha1 == wanted.sha1) or (existing.md5 and existing.md5 == wanted.md5)) and 
existing.name == wanted.path and existing.size == wanted.size: + if ((existing.sha1 and existing.sha1 == wanted.sha1) or + (existing.md5 and existing.md5 == wanted.md5) + ) and existing.name == wanted.path and existing.size == wanted.size: found = True wanted.status = 'exists' break @@ -69,7 +69,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): wanted.status = 'mismatch-existing' break if not found: - print(f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}", file=sys.stderr) + print( + f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}", + file=sys.stderr) return None return ArchiveStrategyResult( ingest_strategy=self.ingest_strategy, @@ -108,10 +110,11 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): if not os.path.exists(local_path): print(f" downloading {m.path}", file=sys.stderr) - with self.ia_session.get(m.platform_url, stream=True, allow_redirects=True) as r: + with self.ia_session.get(m.platform_url, stream=True, + allow_redirects=True) as r: r.raise_for_status() with open(local_path + '.partial', 'wb') as f: - for chunk in r.iter_content(chunk_size=256*1024): + for chunk in r.iter_content(chunk_size=256 * 1024): f.write(chunk) os.rename(local_path + '.partial', local_path) m.status = 'downloaded-local' @@ -120,7 +123,8 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): print(f" verifying {m.path}", file=sys.stderr) file_meta = gen_file_metadata_path(local_path, allow_empty=True) - assert file_meta['size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}" + assert file_meta[ + 'size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}" if m.sha1: assert file_meta['sha1hex'] == m.sha1 @@ -142,7 +146,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): if file_meta['mimetype'] != m.mimetype and file_meta['mimetype'] != 'text/plain': # these 'tab-separated-values' from dataverse are just noise, don't log them if m.mimetype != 'text/tab-separated-values': - print(f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}", file=sys.stderr) + print( + f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}", + file=sys.stderr) m.mimetype = file_meta['mimetype'] else: m.mimetype = file_meta['mimetype'] @@ -158,7 +164,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): 'remote_name': m.path, }) - print(f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...", file=sys.stderr) + print( + f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...", + file=sys.stderr) internetarchive.upload( item.archiveorg_item_name, files=item_files, @@ -183,25 +191,26 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): return result + class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy): """ ArchiveorgFilesetStrategy currently works fine with individual files. Just need to over-ride the ingest_strategy name. 
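For reference, the download/verify loop above reduces to checking each fetched file against the manifest's expected size, md5, and sha1 before upload. A minimal sketch of that check, assuming plain values rather than the FilesetManifestFile model (helper name hypothetical, not from this patch):

import hashlib
import os
from typing import Optional


def verify_local_file(local_path: str,
                      size: Optional[int] = None,
                      md5: Optional[str] = None,
                      sha1: Optional[str] = None) -> bool:
    """Check a downloaded file against expected manifest metadata, roughly
    what the size/sha1/md5 assertions after each download do above."""
    if size is not None and os.path.getsize(local_path) != size:
        return False
    h_md5 = hashlib.md5()
    h_sha1 = hashlib.sha1()
    with open(local_path, 'rb') as f:
        # stream in 256 KiB chunks, same chunk size as the download loop
        for chunk in iter(lambda: f.read(256 * 1024), b''):
            h_md5.update(chunk)
            h_sha1.update(chunk)
    if md5 and h_md5.hexdigest() != md5:
        return False
    if sha1 and h_sha1.hexdigest() != sha1:
        return False
    return True

# usage (hypothetical values):
#   verify_local_file('/tmp/fileset/data.csv', size=1234, sha1='...')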
""" - def __init__(self): super().__init__() self.ingest_strategy = IngestStrategy.ArchiveorgFileset self.success_status = "success-file" -class WebFilesetStrategy(FilesetIngestStrategy): +class WebFilesetStrategy(FilesetIngestStrategy): def __init__(self, **kwargs): super().__init__() self.ingest_strategy = IngestStrategy.WebFileset self.wayback_client = WaybackClient() self.try_spn2 = True - self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) + self.spn_client = SavePageNowClient( + spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) self.max_spn_manifest = 20 def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult: @@ -218,23 +227,26 @@ class WebFilesetStrategy(FilesetIngestStrategy): for m in item.manifest: fetch_url = m.platform_url if not fetch_url: - raise NotImplementedError("require 'platform_url' for each file when doing Web fetching") + raise NotImplementedError( + "require 'platform_url' for each file when doing Web fetching") via = "wayback" resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype) - if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')): + if self.try_spn2 and (resource == None or + (resource and resource.status == 'no-capture')): if len(item.manifest) > self.max_spn_manifest: m.status = 'too-much-spn' continue via = "spn2" - resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=True) + resource = self.spn_client.crawl_resource(fetch_url, + self.wayback_client, + force_simple_get=True) - print("[FETCH {:>6}] {} {}".format( - via, - (resource and resource.status), - (resource and resource.terminal_url) or fetch_url), - file=sys.stderr) + print("[FETCH {:>6}] {} {}".format(via, (resource and resource.status), + (resource and resource.terminal_url) + or fetch_url), + file=sys.stderr) m.terminal_url = resource.terminal_url m.terminal_dt = resource.terminal_dt @@ -251,9 +263,11 @@ class WebFilesetStrategy(FilesetIngestStrategy): file_meta, html_resource = fix_transfer_encoding(file_meta, resource) if self.ingest_strategy == "web-file": - file_file_meta = file_meta + file_file_meta = file_meta - if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex']) or (m.sha1 and m.sha1 != file_meta['sha1hex']): + if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex'] + ) or (m.sha1 + and m.sha1 != file_meta['sha1hex']): m.status = 'mismatch' continue @@ -280,8 +294,8 @@ class WebFilesetStrategy(FilesetIngestStrategy): result.file_resource = file_resource return result -class WebFileStrategy(WebFilesetStrategy): +class WebFileStrategy(WebFilesetStrategy): def __init__(self, **kwargs): super().__init__(**kwargs) self.ingest_strategy = IngestStrategy.WebFile diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py index 8ea136e..606af07 100644 --- a/python/sandcrawler/fileset_types.py +++ b/python/sandcrawler/fileset_types.py @@ -1,4 +1,3 @@ - from enum import Enum from typing import Any, Dict, List, Optional, Tuple @@ -13,6 +12,7 @@ class IngestStrategy(str, Enum): ArchiveorgFileset = "archiveorg-fileset" ArchiveorgFilesetBundled = "archiveorg-fileset-bundled" + class FilesetManifestFile(BaseModel): path: str size: Optional[int] @@ -27,6 +27,7 @@ class FilesetManifestFile(BaseModel): terminal_url: Optional[str] terminal_dt: Optional[str] + class FilesetPlatformItem(BaseModel): platform_name: str platform_status: str @@ -39,6 +40,7 @@ class 
FilesetPlatformItem(BaseModel): web_base_url: Optional[str] web_bundle_url: Optional[str] + class ArchiveStrategyResult(BaseModel): ingest_strategy: str status: str @@ -49,6 +51,7 @@ class ArchiveStrategyResult(BaseModel): bundle_resource: Optional[Any] bundle_archiveorg_path: Optional[str] + class PlatformScopeError(Exception): """ For incidents where platform helper discovers that the fileset/dataset is @@ -61,6 +64,7 @@ class PlatformScopeError(Exception): """ pass + class PlatformRestrictedError(Exception): """ When datasets are not publicly available on a platform (yet) diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 5242b3a..16bbb01 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -1,4 +1,3 @@ - import requests from grobid2json import teixml2json @@ -8,7 +7,6 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker class GrobidClient(object): - def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs): self.host_url = host_url self.consolidate_mode = int(kwargs.get('consolidate_mode', 0)) @@ -34,7 +32,7 @@ class GrobidClient(object): files={ 'input': blob, 'consolidateHeader': self.consolidate_mode, - 'consolidateCitations': 0, # too expensive for now + 'consolidateCitations': 0, # too expensive for now 'includeRawCitations': 1, }, timeout=180.0, @@ -46,9 +44,7 @@ class GrobidClient(object): 'error_msg': 'GROBID request (HTTP POST) timeout', } - info = dict( - status_code=grobid_response.status_code, - ) + info = dict(status_code=grobid_response.status_code, ) if grobid_response.status_code == 200: info['status'] = 'success' info['tei_xml'] = grobid_response.text @@ -56,7 +52,8 @@ class GrobidClient(object): # XML is larger than Kafka message size, and much larger than # an article in general; bail out info['status'] = 'error' - info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml'])) + info['error_msg'] = "response XML too large: {} bytes".format( + len(info['tei_xml'])) info.pop('tei_xml') else: # response.text is .content decoded as utf-8 @@ -70,7 +67,13 @@ class GrobidClient(object): tei_json = teixml2json(result['tei_xml'], encumbered=False) meta = dict() biblio = dict() - for k in ('title', 'authors', 'journal', 'date', 'doi', ): + for k in ( + 'title', + 'authors', + 'journal', + 'date', + 'doi', + ): if tei_json.get(k): biblio[k] = tei_json[k] meta['biblio'] = biblio @@ -79,8 +82,8 @@ class GrobidClient(object): meta[k] = tei_json[k] return meta -class GrobidWorker(SandcrawlerFetchWorker): +class GrobidWorker(SandcrawlerFetchWorker): def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs): super().__init__(wayback_client=wayback_client) self.grobid_client = grobid_client @@ -104,18 +107,19 @@ class GrobidWorker(SandcrawlerFetchWorker): return fetch_result blob = fetch_result['blob'] - result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode) + result = self.grobid_client.process_fulltext(blob, + consolidate_mode=self.consolidate_mode) result['file_meta'] = gen_file_metadata(blob) result['source'] = record result['key'] = result['file_meta']['sha1hex'] return result + class GrobidBlobWorker(SandcrawlerWorker): """ This is sort of like GrobidWorker, except it receives blobs directly, instead of fetching blobs from some remote store. 
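A minimal sketch of the GROBID call wrapped by GrobidClient.process_fulltext(), assuming a local GROBID server and the standard /api/processFulltextDocument route (the form fields and timeout mirror the diff; this is illustrative, not part of the patch):

import requests


def grobid_process_fulltext(pdf_blob: bytes,
                            host_url: str = "http://localhost:8070",
                            consolidate_mode: int = 0) -> dict:
    """POST a PDF to GROBID and wrap the outcome in a small status dict."""
    try:
        resp = requests.post(
            host_url + "/api/processFulltextDocument",
            files={
                'input': pdf_blob,
                'consolidateHeader': consolidate_mode,
                'consolidateCitations': 0,  # too expensive for now
                'includeRawCitations': 1,
            },
            timeout=180.0,
        )
    except requests.Timeout:
        return {
            'status': 'error-timeout',
            'error_msg': 'GROBID request (HTTP POST) timeout',
        }
    info = dict(status_code=resp.status_code)
    if resp.status_code == 200:
        info['status'] = 'success'
        info['tei_xml'] = resp.text
    else:
        info['status'] = 'error'
        info['error_msg'] = resp.text[:10000]
    return info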
""" - def __init__(self, grobid_client, sink=None, **kwargs): super().__init__() self.grobid_client = grobid_client @@ -125,8 +129,8 @@ class GrobidBlobWorker(SandcrawlerWorker): def process(self, blob, key=None): if not blob: return None - result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode) + result = self.grobid_client.process_fulltext(blob, + consolidate_mode=self.consolidate_mode) result['file_meta'] = gen_file_metadata(blob) result['key'] = result['file_meta']['sha1hex'] return result - diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 6bdebdd..a44fc67 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -1,4 +1,3 @@ - import json import re import sys @@ -6,7 +5,8 @@ import urllib.parse from bs4 import BeautifulSoup -RESEARCHSQUARE_REGEX = re.compile(r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"') +RESEARCHSQUARE_REGEX = re.compile( + r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"') IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"') OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";') SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';") @@ -33,16 +33,16 @@ def extract_fulltext_url(html_url, html_body): ### General Tricks ### # highwire-style meta tag - meta = soup.find('meta', attrs={"name":"citation_pdf_url"}) + meta = soup.find('meta', attrs={"name": "citation_pdf_url"}) if not meta: - meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"}) + meta = soup.find('meta', attrs={"name": "bepress_citation_pdf_url"}) if not meta: - meta = soup.find('meta', attrs={"name":"wkhealth_pdf_url"}) + meta = soup.find('meta', attrs={"name": "wkhealth_pdf_url"}) if not meta: # researchgate does this; maybe others also? - meta = soup.find('meta', attrs={"property":"citation_pdf_url"}) + meta = soup.find('meta', attrs={"property": "citation_pdf_url"}) if not meta: - meta = soup.find('meta', attrs={"name":"eprints.document_url"}) + meta = soup.find('meta', attrs={"name": "eprints.document_url"}) # if tag is only partially populated if meta and not meta.get('content'): meta = None @@ -52,10 +52,10 @@ def extract_fulltext_url(html_url, html_body): if '://doi.org/' in url: print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr) elif url.startswith('/'): - if host_prefix+url == html_url: + if host_prefix + url == html_url: print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr) else: - return dict(pdf_url=host_prefix+url, technique='citation_pdf_url') + return dict(pdf_url=host_prefix + url, technique='citation_pdf_url') elif url.startswith('http'): if url == html_url: print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr) @@ -64,7 +64,7 @@ def extract_fulltext_url(html_url, html_body): else: print("\tmalformed citation_pdf_url? 
{}".format(url), file=sys.stderr) - meta = soup.find('meta', attrs={"name":"generator"}) + meta = soup.find('meta', attrs={"name": "generator"}) meta_generator = None if meta and meta.get('content'): meta_generator = meta['content'].strip() @@ -105,7 +105,8 @@ def extract_fulltext_url(html_url, html_body): json_meta = json.loads(json_text) pdf_meta = json_meta['article']['pdfDownload']['urlMetadata'] # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf - url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid'] + url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams'][ + 'md5'] + "&pid=" + pdf_meta['queryParams']['pid'] except (KeyError, TypeError, json.JSONDecodeError): pass if url: @@ -130,7 +131,9 @@ def extract_fulltext_url(html_url, html_body): if m: url = m.group(1) assert len(url) < 4096 - return dict(release_stage="published", pdf_url=host_prefix+url, technique="ieeexplore") + return dict(release_stage="published", + pdf_url=host_prefix + url, + technique="ieeexplore") # https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313 if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url: # HTML iframe like: @@ -172,11 +175,12 @@ def extract_fulltext_url(html_url, html_body): '://thesiscommons.org/', ] for domain in OSF_DOMAINS: - if domain in html_url and (len(html_url.split('/')) in [4,5] or '/preprints/' in html_url) and '/download' not in html_url: + if domain in html_url and (len(html_url.split('/')) in [4, 5] or '/preprints/' + in html_url) and '/download' not in html_url: if not html_url.endswith("/"): - next_url = html_url+"/download" + next_url = html_url + "/download" else: - next_url = html_url+"download" + next_url = html_url + "download" return dict(next_url=next_url, technique='osf-by-url') # wiley @@ -199,14 +203,14 @@ def extract_fulltext_url(html_url, html_body): url = html_url.replace("/doi/10.", "/doi/pdf/10.") return dict(pdf_url=url, technique='archivist-url') # <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank"> - hrefs = soup.find_all('a', attrs={"target":"_blank"}) + hrefs = soup.find_all('a', attrs={"target": "_blank"}) for href in hrefs: url = href['href'].strip() if "/doi/pdf/" in url: if url.startswith('http'): return dict(pdf_url=url, technique='publisher-href') elif url.startswith('/'): - return dict(pdf_url=host_prefix+url, technique='publisher-href') + return dict(pdf_url=host_prefix + url, technique='publisher-href') # protocols.io # https://www.protocols.io/view/flow-cytometry-protocol-mgdc3s6 @@ -248,7 +252,8 @@ def extract_fulltext_url(html_url, html_body): if "://ehp.niehs.nih.gov/doi/" in html_url: # <a href="/doi/pdf/10.1289/EHP4709" target="_blank"> if b'/doi/pdf/10.' 
in html_body: - url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.') + url = html_url.replace('/doi/full/10.', + '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.') return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url') # cogentoa.com @@ -275,7 +280,7 @@ def extract_fulltext_url(html_url, html_body): # http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ if '://en.gzbd.cnki.net/KCMS/detail/detail.aspx' in html_url: # <a onclick="WriteKrsDownLog()" target="_blank" id="pdfDown" name="pdfDown" href="/gzbt/download.aspx?filename=4Q1ZYpFdKFUZ6FDR1QkRrolayRXV2ZzattyQ3QFa2JXTyZXUSV3QRFkbndzaGV2KyJXWZVEbFdVYnZndD9EOxg1Tj5Eeys2SMFzLZ5kcuFkM3dEbsR2ZjxEaShVdJhFdp90KhlVVzcjVVlXUVNHWBtWS5Rlb5cnc&tablename=GZBJLAST2020&dflag=pdfdown
 "><i></i>PDF Download</a> - href = soup.find('a', attrs={"id":"pdfDown"}) + href = soup.find('a', attrs={"id": "pdfDown"}) if href: url = href['href'].strip().replace('
', '') if not url.startswith('http'): @@ -300,7 +305,7 @@ def extract_fulltext_url(html_url, html_body): # OJS 3 (some) if meta_generator and meta_generator.startswith("Open Journal Systems"): - href = soup.find('a', attrs={"class":"obj_galley_link file"}) + href = soup.find('a', attrs={"class": "obj_galley_link file"}) if href and href.text and "pdf" in href.text.lower(): url = href['href'].strip() if url.startswith('/'): @@ -329,13 +334,15 @@ def extract_fulltext_url(html_url, html_body): return dict() + def test_regex(): lines = """ blah var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"; asdf""" m = OVID_JOURNAL_URL_REGEX.search(lines) - assert m.group(1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689" + assert m.group( + 1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689" lines = """ window.onload = function () { diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index c6725dc..6d27a3a 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -1,4 +1,3 @@ - import datetime import sys import urllib.parse @@ -31,9 +30,7 @@ HEAD_META_PATTERNS: Any = { "meta[name='dcterms.title']", "meta[name='dc.title']", ], - "subtitle": [ - "meta[name='prism.subtitle']", - ], + "subtitle": ["meta[name='prism.subtitle']", ], "doi": [ "meta[name='citation_doi']", "meta[name='DOI']", @@ -43,9 +40,7 @@ HEAD_META_PATTERNS: Any = { "meta[name='dc.identifier.doi']", "meta[name='dc.identifier'][scheme='doi']", ], - "pmid": [ - "meta[name='citation_pmid']", - ], + "pmid": ["meta[name='citation_pmid']", ], "abstract": [ "meta[name='citation_abstract']", "meta[name='bepress_citation_abstract']", @@ -66,9 +61,7 @@ HEAD_META_PATTERNS: Any = { "meta[name='dc.source']", "meta[property='og:site_name']", ], - "container_abbrev": [ - "meta[name='citation_journal_abbrev']", - ], + "container_abbrev": ["meta[name='citation_journal_abbrev']", ], "raw_date": [ "meta[name='citation_publication_date']", "meta[name='bepress_citation_publication_date']", @@ -169,9 +162,7 @@ HEAD_META_LIST_PATTERNS: Any = { "meta[name='dc.contributor']", ], # TODO: citation_author_institution - "raw_references": [ - "meta[name='citation_reference']", - ], + "raw_references": ["meta[name='citation_reference']", ], "raw_identifiers": [ "meta[name='eprints.id_number']", "meta[name='dcterms.identifier']", @@ -260,7 +251,7 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [ COMPONENT_FULLTEXT_PATTERNS: List[dict] = [ { - "in_doc_url": "pensoft.net/article/", # also /element/ + "in_doc_url": "pensoft.net/article/", # also /element/ "in_fulltext_url": "/download/fig/", "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small", "attr": "href", @@ -652,12 +643,11 @@ class BiblioMetadata(pydantic.BaseModel): component_url: Optional[str] class Config: - json_encoders = { - datetime.date: lambda dt: dt.isoformat() - } + json_encoders = {datetime.date: lambda dt: dt.isoformat()} -def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]: +def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, + patterns: List[dict]) -> Optional[Tuple[str, str]]: """ Tries to quickly extract fulltext URLs using a set of patterns. This function is intendend to be generic across various extraction techniques. 
@@ -701,6 +691,7 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict return self_doc_url return None + def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]: meta: Any = dict() @@ -772,6 +763,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat return BiblioMetadata(**meta) + def load_adblock_rules() -> braveblock.Adblocker: """ TODO: consider blocking very generic assets: @@ -838,7 +830,8 @@ def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name return resources -def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker) -> list: +def html_extract_resources(doc_url: str, doc: HTMLParser, + adblock: braveblock.Adblocker) -> list: """ This function tries to find all the important resources in a page. The presumption is that the HTML document is article fulltext, and we want the @@ -869,10 +862,12 @@ def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Ad r['url'] = urllib.parse.urljoin(doc_url, r['url']) # filter using adblocker - resources = [r for r in resources if adblock.check_network_urls(r['url'], source_url=doc_url, request_type=r['type']) == False] + resources = [ + r for r in resources if adblock.check_network_urls( + r['url'], source_url=doc_url, request_type=r['type']) == False + ] # remove duplicates resources = [dict(t) for t in {tuple(d.items()) for d in resources}] return resources - diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index ca1182f..a8ce193 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -1,4 +1,3 @@ - # XXX: some broken MRO thing going on in here due to python3 object wrangling # in `wayback` library. Means we can't run pylint. 
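The adblock filter and de-duplication at the end of html_extract_resources(), shown in isolation as a sketch; it assumes resource dicts carrying 'url' and 'type' keys (as _extract_generic() builds) and an Adblocker from load_adblock_rules():

from typing import List

import braveblock


def filter_resources(resources: List[dict], doc_url: str,
                     adblock: braveblock.Adblocker) -> List[dict]:
    """Drop ad/tracker resources, then de-duplicate."""
    kept = [
        r for r in resources
        if not adblock.check_network_urls(
            r['url'], source_url=doc_url, request_type=r['type'])
    ]
    # de-duplicate on full dict contents (same set-of-tuples trick as above)
    return [dict(t) for t in {tuple(d.items()) for d in kept}]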
# pylint: skip-file @@ -38,6 +37,7 @@ class SandcrawlerBackoffError(Exception): """ pass + ResourceResult = namedtuple("ResourceResult", [ "start_url", "hit", @@ -80,6 +80,7 @@ CdxPartial = namedtuple('CdxPartial', [ 'sha1hex', ]) + def cdx_partial_from_row(full): return CdxPartial( surt=full.surt, @@ -91,6 +92,7 @@ def cdx_partial_from_row(full): sha1hex=full.sha1hex, ) + def cdx_to_dict(cdx): d = { "surt": cdx.surt, @@ -107,6 +109,7 @@ def cdx_to_dict(cdx): d['warc_path'] = cdx.warc_path return d + def fuzzy_match_url(left, right): """ Matches URLs agnostic of http/https (and maybe other normalizations in the @@ -123,6 +126,7 @@ def fuzzy_match_url(left, right): return True return False + def test_fuzzy_match_url(): assert fuzzy_match_url("http://thing.com", "http://thing.com") == True assert fuzzy_match_url("http://thing.com", "https://thing.com") == True @@ -137,18 +141,19 @@ def test_fuzzy_match_url(): assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False + class CdxApiError(Exception): pass -class CdxApiClient: +class CdxApiClient: def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs): self.host_url = host_url self.http_session = requests_retry_session(retries=3, backoff_factor=3) - cdx_auth_token = kwargs.get('cdx_auth_token', - os.environ.get('CDX_AUTH_TOKEN')) + cdx_auth_token = kwargs.get('cdx_auth_token', os.environ.get('CDX_AUTH_TOKEN')) if not cdx_auth_token: - raise Exception("CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)") + raise Exception( + "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)") self.http_session.headers.update({ 'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient', 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token), @@ -208,7 +213,8 @@ class CdxApiClient: found, because we expect to be looking up a specific full record. """ if len(datetime) != 14: - raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)) + raise ValueError( + "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)) params = { 'url': url, 'from': datetime, @@ -226,18 +232,28 @@ class CdxApiClient: if retry_sleep > 3: next_sleep = retry_sleep - 3 retry_sleep = 3 - print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr) + print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), + file=sys.stderr) time.sleep(retry_sleep) - return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep) + return self.fetch(url, + datetime, + filter_status_code=filter_status_code, + retry_sleep=next_sleep) raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime)) row = resp[0] # allow fuzzy http/https match if not (fuzzy_match_url(row.url, url) and row.datetime == datetime): if retry_sleep and retry_sleep > 0: - print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr) + print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), + file=sys.stderr) time.sleep(retry_sleep) - return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None) - raise KeyError("Didn't get exact CDX url/datetime match. 
url:{} dt:{} got:{}".format(url, datetime, row)) + return self.fetch(url, + datetime, + filter_status_code=filter_status_code, + retry_sleep=None) + raise KeyError( + "Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format( + url, datetime, row)) if filter_status_code: assert row.status_code == filter_status_code return row @@ -311,17 +327,20 @@ class CdxApiClient: class WaybackError(Exception): pass + class WaybackContentError(Exception): pass + class PetaboxError(Exception): pass + class NoCaptureError(Exception): pass -class WaybackClient: +class WaybackClient: def __init__(self, cdx_client=None, **kwargs): if cdx_client: self.cdx_client = cdx_client @@ -367,32 +386,42 @@ class WaybackClient: if not self.petabox_webdata_secret: raise Exception("WaybackClient needs petabox secret to do direct WARC fetches") if not "/" in warc_path: - raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path)) + raise ValueError( + "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path)) warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: - self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory3( - webdata_secret=self.petabox_webdata_secret, - )) + self.rstore = ResourceStore( + loaderfactory=CDXLoaderFactory3(webdata_secret=self.petabox_webdata_secret, )) try: #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr) gwb_record = self.rstore.load_resource(warc_uri, offset, csize) except wayback.exception.ResourceUnavailable: print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) - raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)") + raise PetaboxError( + "failed to load file contents from wayback/petabox (ResourceUnavailable)") except wayback.exception.InvalidResource: print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) - raise WaybackContentError("failed to load file contents from wayback/petabox (InvalidResource)") + raise WaybackContentError( + "failed to load file contents from wayback/petabox (InvalidResource)") except urllib3.exceptions.ReadTimeoutError as rte: - raise PetaboxError("failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(rte)) + raise PetaboxError( + "failed to load file contents from wayback/petabox (ReadTimeoutError: {})". 
+ format(rte)) except ValueError as ve: - raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + raise PetaboxError( + "failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) except EOFError as eofe: - raise PetaboxError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + raise PetaboxError( + "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) except TypeError as te: - raise PetaboxError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) + raise PetaboxError( + "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" + .format(te)) except Exception as e: if "while decompressing data: invalid block type" in str(e): - raise PetaboxError("decompression error fetching WARC record; usually due to bad alexa ARC files") + raise PetaboxError( + "decompression error fetching WARC record; usually due to bad alexa ARC files" + ) else: raise e # Note: could consider a generic "except Exception" here, as we get so @@ -405,7 +434,8 @@ class WaybackClient: raise WaybackContentError("too many HTTP headers (in wayback fetch)") location = gwb_record.get_location() or None - if status_code is None and gwb_record.target_uri.startswith(b"ftp://") and not gwb_record.is_revisit(): + if status_code is None and gwb_record.target_uri.startswith( + b"ftp://") and not gwb_record.is_revisit(): # TODO: some additional verification here? status_code = 226 @@ -416,8 +446,9 @@ class WaybackClient: raise WaybackContentError("found revisit record, but won't resolve (loop?)") revisit_uri, revisit_dt = gwb_record.refers_to if not (revisit_uri and revisit_dt): - raise WaybackContentError("revisit record missing URI and/or DT: warc:{} offset:{}".format( - warc_path, offset)) + raise WaybackContentError( + "revisit record missing URI and/or DT: warc:{} offset:{}".format( + warc_path, offset)) # convert revisit_dt # len("2018-07-24T11:56:49"), or with "Z" assert len(revisit_dt) in (19, 20) @@ -425,7 +456,9 @@ class WaybackClient: revisit_uri = revisit_uri.decode('utf-8') if type(revisit_dt) is bytes: revisit_dt = revisit_dt.decode('utf-8') - revisit_dt = revisit_dt.replace('-', '').replace(':', '').replace('T', '').replace('Z', '') + revisit_dt = revisit_dt.replace('-', '').replace(':', + '').replace('T', + '').replace('Z', '') assert len(revisit_dt) == 14 try: revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt) @@ -443,10 +476,10 @@ class WaybackClient: body = gwb_record.open_raw_content().read() except IncompleteRead as ire: raise WaybackError( - "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + "failed to read actual file contents from wayback/petabox (IncompleteRead: {})" + .format(ire)) elif status_code is None: - raise WaybackContentError( - "got a None status_code in (W)ARC record") + raise WaybackContentError("got a None status_code in (W)ARC record") return WarcResource( status_code=status_code, location=location, @@ -454,7 +487,12 @@ class WaybackClient: revisit_cdx=revisit_cdx, ) - def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True, expected_status_code=None): + def fetch_petabox_body(self, + csize, + offset, + warc_path, + resolve_revisit=True, + expected_status_code=None): """ Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize. 
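The revisit-timestamp normalization done before the revisit cdx_client.fetch() call above, as a standalone sketch (hypothetical name):

def normalize_revisit_dt(revisit_dt) -> str:
    """Collapse a revisit timestamp like "2018-07-24T11:56:49" (possibly
    bytes, possibly with a trailing "Z") into the 14-digit CDX datetime."""
    if isinstance(revisit_dt, bytes):
        revisit_dt = revisit_dt.decode('utf-8')
    assert len(revisit_dt) in (19, 20)
    dt = revisit_dt.replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
    assert len(dt) == 14
    return dt

# normalize_revisit_dt("2018-07-24T11:56:49Z") -> "20180724115649"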
@@ -474,12 +512,10 @@ class WaybackClient: raise KeyError("archived HTTP response (WARC) was not {}: {}".format( expected_status_code, resource.status_code, - ) - ) + )) elif resource.status_code not in (200, 226): raise KeyError("archived HTTP response (WARC) was not 200: {}".format( - resource.status_code) - ) + resource.status_code)) return resource.body @@ -514,7 +550,9 @@ class WaybackClient: except requests.exceptions.ChunkedEncodingError: raise WaybackError("ChunkedEncodingError (wayback replay fetch)") except UnicodeDecodeError: - raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url)) + raise WaybackContentError( + "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format( + url)) try: resp.raise_for_status() @@ -526,21 +564,20 @@ class WaybackClient: if not "X-Archive-Src" in resp.headers: raise WaybackError("replay fetch didn't return X-Archive-Src in headers") if not datetime in resp.url: - raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url)) + raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format( + datetime, resp.url)) if cdx_sha1hex: # verify that body matches CDX hash # TODO: don't need *all* these hashes, just sha1 file_meta = gen_file_metadata(resp.content) if cdx_sha1hex != file_meta['sha1hex']: - print(" REPLAY MISMATCH: cdx:{} replay:{}".format( - cdx_sha1hex, - file_meta['sha1hex']), - file=sys.stderr) - raise WaybackContentError("replay fetch body didn't match CDX hash cdx:{} body:{}".format( - cdx_sha1hex, - file_meta['sha1hex']), - ) + print(" REPLAY MISMATCH: cdx:{} replay:{}".format(cdx_sha1hex, + file_meta['sha1hex']), + file=sys.stderr) + raise WaybackContentError( + "replay fetch body didn't match CDX hash cdx:{} body:{}".format( + cdx_sha1hex, file_meta['sha1hex']), ) return resp.content def fetch_replay_redirect(self, url, datetime): @@ -568,7 +605,9 @@ class WaybackClient: except requests.exceptions.TooManyRedirects: raise WaybackContentError("redirect loop (wayback replay fetch)") except UnicodeDecodeError: - raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url)) + raise WaybackContentError( + "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format( + url)) try: resp.raise_for_status() except Exception as e: @@ -580,7 +619,8 @@ class WaybackClient: if not "X-Archive-Src" in resp.headers: raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers") if not datetime in resp.url: - raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url)) + raise WaybackError("didn't get exact reply (redirect?) 
datetime:{} got:{}".format( + datetime, resp.url)) redirect_url = resp.headers.get("Location") # eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw @@ -622,7 +662,9 @@ class WaybackClient: urls_seen = [start_url] for i in range(self.max_redirects): print(" URL: {}".format(next_url), file=sys.stderr) - cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype, closest=closest) + cdx_row = self.cdx_client.lookup_best(next_url, + best_mimetype=best_mimetype, + closest=closest) #print(cdx_row, file=sys.stderr) if not cdx_row: return ResourceResult( @@ -776,9 +818,11 @@ class WaybackClient: class SavePageNowError(Exception): pass + class SavePageNowBackoffError(SandcrawlerBackoffError): pass + SavePageNowResult = namedtuple('SavePageNowResult', [ 'success', 'status', @@ -789,13 +833,11 @@ SavePageNowResult = namedtuple('SavePageNowResult', [ 'resources', ]) -class SavePageNowClient: +class SavePageNowClient: def __init__(self, v2endpoint="https://web.archive.org/save", **kwargs): - self.ia_access_key = kwargs.get('ia_access_key', - os.environ.get('IA_ACCESS_KEY')) - self.ia_secret_key = kwargs.get('ia_secret_key', - os.environ.get('IA_SECRET_KEY')) + self.ia_access_key = kwargs.get('ia_access_key', os.environ.get('IA_ACCESS_KEY')) + self.ia_secret_key = kwargs.get('ia_secret_key', os.environ.get('IA_SECRET_KEY')) self.v2endpoint = v2endpoint self.v2_session = requests_retry_session(retries=5, backoff_factor=3) self.v2_session.headers.update({ @@ -886,12 +928,15 @@ class SavePageNowClient: }, ) if resp.status_code == 429: - raise SavePageNowBackoffError("status_code: {}, url: {}".format(resp.status_code, request_url)) + raise SavePageNowBackoffError("status_code: {}, url: {}".format( + resp.status_code, request_url)) elif resp.status_code != 200: - raise SavePageNowError("SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)) + raise SavePageNowError("SPN2 status_code: {}, url: {}".format( + resp.status_code, request_url)) resp_json = resp.json() - if resp_json and 'message' in resp_json and 'You have already reached the limit of active sessions' in resp_json['message']: + if resp_json and 'message' in resp_json and 'You have already reached the limit of active sessions' in resp_json[ + 'message']: raise SavePageNowBackoffError(resp_json['message']) elif not resp_json or 'job_id' not in resp_json or not resp_json['job_id']: raise SavePageNowError( @@ -915,7 +960,8 @@ class SavePageNowClient: final_json = resp.json() break else: - raise SavePageNowError("Unknown SPN2 status:{} url:{}".format(status, request_url)) + raise SavePageNowError("Unknown SPN2 status:{} url:{}".format( + status, request_url)) if not final_json: raise SavePageNowError("SPN2 timed out (polling count exceeded)") @@ -923,8 +969,10 @@ class SavePageNowClient: # if there was a recent crawl of same URL, fetch the status of that # crawl to get correct datetime if final_json.get('original_job_id'): - print(f" SPN recent capture: {job_id} -> {final_json['original_job_id']}", file=sys.stderr) - resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, final_json['original_job_id'])) + print(f" SPN recent capture: {job_id} -> {final_json['original_job_id']}", + file=sys.stderr) + resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, + final_json['original_job_id'])) try: resp.raise_for_status() except: @@ -935,7 +983,8 @@ class SavePageNowClient: if final_json['status'] == "success": if final_json.get('original_url').startswith('/'): 
- print(f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}", file=sys.stderr) + print(f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}", + file=sys.stderr) return SavePageNowResult( True, "success", @@ -969,15 +1018,17 @@ class SavePageNowClient: # HACK: capture CNKI domains with outlinks (for COVID-19 crawling) if 'gzbd.cnki.net/' in start_url: - spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get, capture_outlinks=1) + spn_result = self.save_url_now_v2(start_url, + force_simple_get=force_simple_get, + capture_outlinks=1) else: spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get) if not spn_result.success: status = spn_result.status if status in ("error:invalid-url", "error:not-found", - "error:invalid-host-resolution", "error:gateway-timeout", - "error:too-many-redirects", "error:read-timeout"): + "error:invalid-host-resolution", "error:gateway-timeout", + "error:too-many-redirects", "error:read-timeout"): status = status.replace("error:", "") elif status in ("error:no-access", "error:forbidden"): status = "forbidden" @@ -988,7 +1039,8 @@ class SavePageNowClient: elif status.startswith("error:"): status = "spn2-" + status # despite other errors, call these a failure (so we don't retry) - if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1")): + if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') + or spn_result.terminal_url.endswith("cookieSet=1")): status = "blocked-cookie" return ResourceResult( start_url=start_url, @@ -1018,7 +1070,8 @@ class SavePageNowClient: ) # don't try to CDX fetch for this common cookie block terminal - if spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"): + if spn_result.terminal_url.endswith( + '/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"): return ResourceResult( start_url=start_url, hit=False, @@ -1143,9 +1196,12 @@ class SavePageNowClient: ) -def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]: - if resource.body and file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip': - print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr) +def fix_transfer_encoding(file_meta: dict, + resource: ResourceResult) -> Tuple[dict, ResourceResult]: + if resource.body and file_meta[ + 'mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip': + print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), + file=sys.stderr) inner_body = gzip.decompress(resource.body) if not inner_body: raise Exception("null body inside transfer encoding") diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index 137a793..b480cc2 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -1,4 +1,3 @@ - import base64 import gzip import json @@ -15,18 +14,22 @@ from selectolax.parser import HTMLParser from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.grobid import GrobidClient from sandcrawler.html import extract_fulltext_url -from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules -from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient, 
- SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict, +from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio, + html_extract_resources, load_adblock_rules) +from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, + ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient, + WaybackContentError, WaybackError, cdx_to_dict, fix_transfer_encoding) -from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform, +from sandcrawler.ingest_html import (WebResource, fetch_html_resources, + html_extract_body_teixml, html_guess_platform, html_guess_scope, quick_fetch_html_resources) from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime from sandcrawler.pdfextract import PdfExtractResult, process_pdf from sandcrawler.workers import SandcrawlerWorker from sandcrawler.xml import xml_reserialize -MAX_BODY_SIZE_BYTES = 128*1024*1024 +MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024 + class IngestFileWorker(SandcrawlerWorker): """ @@ -54,7 +57,6 @@ class IngestFileWorker(SandcrawlerWorker): process_file_hit(ResourceResult) -> response process_grobid(ResourceResult) """ - def __init__(self, sink=None, **kwargs): super().__init__() @@ -64,7 +66,8 @@ class IngestFileWorker(SandcrawlerWorker): self.wayback_client = WaybackClient() self.spn_client = kwargs.get('spn_client') if not self.spn_client: - self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) + self.spn_client = SavePageNowClient( + spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) self.grobid_client = kwargs.get('grobid_client') if not self.grobid_client: self.grobid_client = GrobidClient() @@ -123,13 +126,13 @@ class IngestFileWorker(SandcrawlerWorker): "fao.org/glis/", # Historical non-paper content: - "dhz.uni-passau.de/", # newspapers - "digital.ucd.ie/", # ireland national historical + "dhz.uni-passau.de/", # newspapers + "digital.ucd.ie/", # ireland national historical # DOI prefixes - "doi.org/10.2307/", # JSTOR; slow and many redirects - "doi.org/10.18730/", # fao.org: database entry - "doi.org/10.15468/", # gbif.org: database entry + "doi.org/10.2307/", # JSTOR; slow and many redirects + "doi.org/10.18730/", # fao.org: database entry + "doi.org/10.15468/", # gbif.org: database entry # deprecated domain (doesn't redirect correctly) "://edoc.mpg.de/", @@ -173,10 +176,10 @@ class IngestFileWorker(SandcrawlerWorker): "video/mpeg", "text/plain", "text/csv", - "text/x-r-source", # dataverse - "text/tab-separated-values", # dataverse - "text/x-rst", # dataverse - "application/x-rlang-transport", # dataverse + "text/x-r-source", # dataverse + "text/tab-separated-values", # dataverse + "text/x-rst", # dataverse + "application/x-rlang-transport", # dataverse "application/json", "application/xml", "application/pdf", @@ -194,7 +197,6 @@ class IngestFileWorker(SandcrawlerWorker): "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ] - def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]: """ Check in sandcrawler-db (postgres) to see if we have already ingested @@ -214,7 +216,10 @@ class IngestFileWorker(SandcrawlerWorker): else: return None - def find_resource(self, url, best_mimetype=None, force_recrawl=False) -> Optional[ResourceResult]: + def find_resource(self, + url, + best_mimetype=None, + force_recrawl=False) -> Optional[ResourceResult]: """ Looks in wayback for a resource starting at the URL, 
following any redirects. If a hit isn't found, try crawling with SPN. @@ -222,7 +227,8 @@ class IngestFileWorker(SandcrawlerWorker): via = "none" resource = None - if url.startswith("http://web.archive.org/web/") or url.startswith("https://web.archive.org/web/"): + if url.startswith("http://web.archive.org/web/") or url.startswith( + "https://web.archive.org/web/"): raise NotImplementedError("handling direct wayback links not supported yet") if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"): @@ -243,14 +249,13 @@ class IngestFileWorker(SandcrawlerWorker): if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000': old_failure = True - if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') or soft404 or old_failure): + if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') + or soft404 or old_failure): via = "spn2" resource = self.spn_client.crawl_resource(url, self.wayback_client) - print("[FETCH {:>6}] {} {}".format( - via, - (resource and resource.status), - (resource and resource.terminal_url) or url), - file=sys.stderr) + print("[FETCH {:>6}] {} {}".format(via, (resource and resource.status), + (resource and resource.terminal_url) or url), + file=sys.stderr) return resource def process_existing(self, request: dict, result_row: dict) -> dict: @@ -262,7 +267,8 @@ class IngestFileWorker(SandcrawlerWorker): assert result_row['hit'] existing_file_meta = self.pgrest_client.get_file_meta(result_row['terminal_sha1hex']) existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex']) - existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt']) + existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], + result_row['terminal_dt']) if not (existing_file_meta and existing_grobid and existing_cdx): raise NotImplementedError("partially-exsiting records not implemented yet") result = { @@ -281,11 +287,13 @@ class IngestFileWorker(SandcrawlerWorker): } return result - def process_file_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict: + def process_file_hit(self, ingest_type: str, resource: ResourceResult, + file_meta: dict) -> dict: """ Run all the necessary processing for a new/fresh ingest hit. 
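Two small predicates the ingest path above relies on, pulled out as a sketch (hypothetical names, logic copied from the diff):

def is_html_ish(mimetype: str) -> bool:
    """Same mimetype test used before trying to parse a body as HTML and
    look for a next hop."""
    return bool(
        "html" in mimetype
        or "xhtml" in mimetype  # matches "application/xhtml+xml"
        or "application/xml" in mimetype
        or "text/xml" in mimetype)


def effective_ingest_type(ingest_type: str, mimetype: str) -> str:
    """dataset-file/component requests that land on a PDF get handled by
    the regular PDF path, as in process_file_hit() above."""
    if ingest_type in ("dataset-file", "component") and mimetype == "application/pdf":
        return "pdf"
    return ingest_type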
""" - if ingest_type in ["dataset-file", "component"] and file_meta['mimetype'] == "application/pdf": + if ingest_type in ["dataset-file", "component" + ] and file_meta['mimetype'] == "application/pdf": ingest_type = "pdf" if ingest_type == "pdf": return { @@ -396,24 +404,26 @@ class IngestFileWorker(SandcrawlerWorker): try: html_doc = HTMLParser(resource.body) except ValueError as ve: - return dict( - status="html-selectolax-error", - ) + return dict(status="html-selectolax-error", ) html_biblio = html_extract_biblio(resource.terminal_url, html_doc) assert html_biblio html_body = html_extract_body_teixml(resource.body) html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio) - html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count')) + html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, + html_body.get('word_count')) html_biblio_dict = json.loads(html_biblio.json(exclude_none=True)) - if html_scope in ('blocked-captcha','blocked-cookie','blocked-forbidden'): + if html_scope in ('blocked-captcha', 'blocked-cookie', 'blocked-forbidden'): return dict( status=html_scope, html_biblio=html_biblio_dict, scope=html_scope, platform=html_platform, ) - elif html_scope not in ('article-fulltext','unknown',): + elif html_scope not in ( + 'article-fulltext', + 'unknown', + ): html_body.pop("tei_xml", None) return dict( status="wrong-scope", @@ -423,7 +433,8 @@ class IngestFileWorker(SandcrawlerWorker): html_body=html_body, ) - raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules) + raw_resources = html_extract_resources(resource.terminal_url, html_doc, + self.adblock_rules) if len(raw_resources) > self.max_html_resources: html_body.pop("tei_xml", None) return dict( @@ -452,7 +463,9 @@ class IngestFileWorker(SandcrawlerWorker): try: if self.html_quick_mode: print(" WARN: running quick CDX-only fetches", file=sys.stderr) - full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when) + full_resources = quick_fetch_html_resources(raw_resources, + self.wayback_client.cdx_client, + when) else: full_resources = fetch_html_resources(raw_resources, self.wayback_client, when) except PetaboxError as e: @@ -572,7 +585,9 @@ class IngestFileWorker(SandcrawlerWorker): return result try: - resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl) + resource = self.find_resource(next_url, + best_mimetype, + force_recrawl=force_recrawl) except SavePageNowError as e: result['status'] = 'spn2-error' result['error_message'] = str(e)[:1600] @@ -650,10 +665,9 @@ class IngestFileWorker(SandcrawlerWorker): # here we split based on ingest type to try and extract a next hop html_ish_resource = bool( "html" in file_meta['mimetype'] - or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml" + or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml" or "application/xml" in file_meta['mimetype'] - or "text/xml" in file_meta['mimetype'] - ) + or "text/xml" in file_meta['mimetype']) html_biblio = None html_doc = None if html_ish_resource and resource.body: @@ -662,7 +676,8 @@ class IngestFileWorker(SandcrawlerWorker): html_biblio = html_extract_biblio(resource.terminal_url, html_doc) if html_biblio: if not 'html_biblio' in result or html_biblio.title: - result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True)) + result['html_biblio'] = json.loads( + html_biblio.json(exclude_none=True)) #print(f" setting 
html_biblio: {result['html_biblio']}", file=sys.stderr) except ValueError: pass @@ -686,18 +701,19 @@ class IngestFileWorker(SandcrawlerWorker): assert next_url next_url = clean_url(next_url) print("[PARSE {:>6}] {} {}".format( - ingest_type, - fulltext_url.get('technique'), - next_url, - ), - file=sys.stderr) + ingest_type, + fulltext_url.get('technique'), + next_url, + ), + file=sys.stderr) if next_url in hops: result['status'] = 'link-loop' result['error_message'] = "repeated: {}".format(next_url) return result hops.append(next_url) continue - elif ingest_type in ("xml", "html", "component") and html_ish_resource and html_biblio: + elif ingest_type in ("xml", "html", + "component") and html_ish_resource and html_biblio: # NOTE: src_fulltext_url is not a thing next_url_found = None if ingest_type == "xml" and html_biblio.xml_fulltext_url: @@ -711,11 +727,11 @@ class IngestFileWorker(SandcrawlerWorker): next_url = next_url_found technique = "html_biblio" print("[PARSE {:>6}] {} {}".format( - ingest_type, - technique, - next_url, - ), - file=sys.stderr) + ingest_type, + technique, + next_url, + ), + file=sys.stderr) if next_url in hops: if ingest_type == "html": # for HTML ingest, we don't count this as a link-loop @@ -756,7 +772,8 @@ class IngestFileWorker(SandcrawlerWorker): result['status'] = "wrong-mimetype" # formerly: "other-mimetype" return result elif ingest_type == "xml": - if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"): + if file_meta['mimetype'] not in ("application/xml", "text/xml", + "application/jats+xml"): result['status'] = "wrong-mimetype" return result elif ingest_type == "html": @@ -786,18 +803,18 @@ class IngestFileWorker(SandcrawlerWorker): result['hit'] = True if ingest_type == "pdf": print("[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format( - ingest_type, - result.get('file_meta', {}).get('sha1hex'), - result.get('grobid', {}).get('status_code'), - result.get('pdf_meta', {}).get('status'), - ), - file=sys.stderr) + ingest_type, + result.get('file_meta', {}).get('sha1hex'), + result.get('grobid', {}).get('status_code'), + result.get('pdf_meta', {}).get('status'), + ), + file=sys.stderr) else: print("[SUCCESS {:>5}] sha1:{}".format( - ingest_type, - result.get('file_meta', {}).get('sha1hex'), - ), - file=sys.stderr) + ingest_type, + result.get('file_meta', {}).get('sha1hex'), + ), + file=sys.stderr) return result diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index 11386df..5cbb908 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -1,4 +1,3 @@ - import gzip import json import sys @@ -14,17 +13,21 @@ from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, Fileset from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError from sandcrawler.html import extract_fulltext_url -from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules -from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient, - SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict, +from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio, + html_extract_resources, load_adblock_rules) +from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, + 
ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient, + WaybackContentError, WaybackError, cdx_to_dict, fix_transfer_encoding) from sandcrawler.ingest_file import IngestFileWorker -from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform, +from sandcrawler.ingest_html import (WebResource, fetch_html_resources, + html_extract_body_teixml, html_guess_platform, html_guess_scope, quick_fetch_html_resources) from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime from sandcrawler.workers import SandcrawlerWorker -MAX_BODY_SIZE_BYTES = 128*1024*1024 +MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024 + class IngestFilesetWorker(IngestFileWorker): """ @@ -39,14 +42,13 @@ class IngestFilesetWorker(IngestFileWorker): checking to see if content has been archived already) 4. summarize status """ - def __init__(self, sink=None, **kwargs): super().__init__(sink=None, **kwargs) self.sink = sink self.dataset_platform_helpers = DATASET_PLATFORM_HELPER_TABLE self.dataset_strategy_archivers = FILESET_STRATEGY_HELPER_TABLE - self.max_total_size = kwargs.get('max_total_size', 64*1024*1024*1024) + self.max_total_size = kwargs.get('max_total_size', 64 * 1024 * 1024 * 1024) self.max_file_count = kwargs.get('max_file_count', 200) self.ingest_file_result_sink = kwargs.get('ingest_file_result_sink') self.ingest_file_result_stdout = kwargs.get('ingest_file_result_stdout', False) @@ -72,11 +74,12 @@ class IngestFilesetWorker(IngestFileWorker): raise NotImplementedError("process_existing() not tested or safe yet") def want(self, request: dict) -> bool: - if not request.get('ingest_type') in ('dataset',): + if not request.get('ingest_type') in ('dataset', ): return False return True - def fetch_resource_iteratively(self, ingest_type: str, base_url: str, force_recrawl: bool) -> dict: + def fetch_resource_iteratively(self, ingest_type: str, base_url: str, + force_recrawl: bool) -> dict: """ This is copypasta from process_file(), should probably refactor. 
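A simplified sketch of the manifest guardrails IngestFilesetWorker applies before choosing a strategy; the limit defaults are copied from the constructor above, while the returned status strings are illustrative:

from typing import List, Optional, Tuple


def manifest_guardrails(sizes: List[Optional[int]],
                        max_file_count: int = 200,
                        max_total_size: int = 64 * 1024 * 1024 * 1024
                        ) -> Tuple[int, int, Optional[str]]:
    """Compute file_count/total_size for a manifest and apply the limits."""
    file_count = len(sizes)
    total_size = sum(s for s in sizes if s)
    if file_count > max_file_count:
        return file_count, total_size, 'too-many-files'
    if total_size > max_total_size:
        return file_count, total_size, 'too-large-size'
    return file_count, total_size, None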
""" @@ -174,10 +177,9 @@ class IngestFilesetWorker(IngestFileWorker): # here we split based on ingest type to try and extract a next hop html_ish_resource = bool( "html" in file_meta['mimetype'] - or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml" + or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml" or "application/xml" in file_meta['mimetype'] - or "text/xml" in file_meta['mimetype'] - ) + or "text/xml" in file_meta['mimetype']) html_biblio = None html_doc = None if html_ish_resource and resource.body: @@ -186,7 +188,8 @@ class IngestFilesetWorker(IngestFileWorker): html_biblio = html_extract_biblio(resource.terminal_url, html_doc) if html_biblio: if not 'html_biblio' in result or html_biblio.title: - result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True)) + result['html_biblio'] = json.loads( + html_biblio.json(exclude_none=True)) #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr) except ValueError: pass @@ -214,7 +217,8 @@ class IngestFilesetWorker(IngestFileWorker): result['status'] = "wrong-mimetype" # formerly: "other-mimetype" return result elif ingest_type == "xml": - if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"): + if file_meta['mimetype'] not in ("application/xml", "text/xml", + "application/jats+xml"): result['status'] = "wrong-mimetype" return result elif ingest_type == "html": @@ -229,11 +233,10 @@ class IngestFilesetWorker(IngestFileWorker): result['_resource'] = resource return result - def process(self, request: dict, key: Any = None) -> dict: ingest_type = request.get('ingest_type') - if ingest_type not in ("dataset",): + if ingest_type not in ("dataset", ): raise NotImplementedError(f"can't handle ingest_type={ingest_type}") # parse/clean URL @@ -250,7 +253,9 @@ class IngestFilesetWorker(IngestFileWorker): #if existing: # return self.process_existing(request, existing) - result = self.fetch_resource_iteratively(ingest_type, base_url, force_recrawl=force_recrawl) + result = self.fetch_resource_iteratively(ingest_type, + base_url, + force_recrawl=force_recrawl) result['request'] = request if result.get('status') != None: result['request'] = request @@ -323,14 +328,16 @@ class IngestFilesetWorker(IngestFileWorker): return result if result['file_count'] > self.max_file_count: # hard max, to prevent downstream breakage - if result['file_count'] > 10*1000: + if result['file_count'] > 10 * 1000: result['manifest'] = result['manifest'][:self.max_file_count] result['status'] = 'too-many-files' return result ingest_strategy = platform_helper.chose_strategy(dataset_meta) result['ingest_strategy'] = ingest_strategy - print(f"[PLATFORM {platform}] id={dataset_meta.platform_id} file_count={result['file_count']} total_size={result['total_size']} strategy={ingest_strategy}", file=sys.stderr) + print( + f"[PLATFORM {platform}] id={dataset_meta.platform_id} file_count={result['file_count']} total_size={result['total_size']} strategy={ingest_strategy}", + file=sys.stderr) strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy) if not strategy_helper: @@ -349,7 +356,8 @@ class IngestFilesetWorker(IngestFileWorker): if archive_result.bundle_file_meta: result['fileset_bundle']['file_meta'] = archive_result.bundle_file_meta if archive_result.archiveorg_bundle_path: - result['fileset_bundle']['archiveorg_bundle_path'] = archive_result.archiveorg_bundle_path + result['fileset_bundle'][ + 'archiveorg_bundle_path'] = archive_result.archiveorg_bundle_path if 
archive_result.bundle_resource: result['fileset_bundle']['terminal'] = dict( terminal_url=archive_result.bundle_resource.terminal_url, @@ -357,14 +365,16 @@ class IngestFilesetWorker(IngestFileWorker): terminal_status_code=archive_result.bundle_resource.terminal_status_code, ) if archive_result.bundle_resource.cdx: - result['fileset_bundle']['cdx'] = cdx_to_dict(archive_result.bundle_resource.cdx) + result['fileset_bundle']['cdx'] = cdx_to_dict( + archive_result.bundle_resource.cdx) if archive_result.bundle_resource.revisit_cdx: - result['fileset_bundle']['revisit_cdx'] = cdx_to_dict(archive_result.bundle_resource.revisit_cdx) + result['fileset_bundle']['revisit_cdx'] = cdx_to_dict( + archive_result.bundle_resource.revisit_cdx) if ingest_strategy.endswith('-file'): result['fileset_file'] = dict() if archive_result.file_file_meta: - result['fileset_file']['file_meta'] = file_meta=archive_result.file_file_meta, + result['fileset_file']['file_meta'] = file_meta = archive_result.file_file_meta, if archive_result.file_resource: result['fileset_file']['terminal'] = dict( terminal_url=archive_result.file_resource.terminal_url, @@ -372,16 +382,20 @@ class IngestFilesetWorker(IngestFileWorker): terminal_status_code=archive_result.file_resource.terminal_status_code, ) if archive_result.file_resource.cdx: - result['fileset_file']['cdx'] = cdx_to_dict(archive_result.file_resource.cdx) + result['fileset_file']['cdx'] = cdx_to_dict( + archive_result.file_resource.cdx) if archive_result.file_resource.revisit_cdx: - result['fileset_file']['revisit_cdx'] = cdx_to_dict(archive_result.file_resource.revisit_cdx) + result['fileset_file']['revisit_cdx'] = cdx_to_dict( + archive_result.file_resource.revisit_cdx) if result['status'].startswith('success'): # check that these are still valid assert result['file_count'] == len(archive_result.manifest) - assert result['total_size'] == sum([m.size for m in archive_result.manifest if m.size]) + assert result['total_size'] == sum( + [m.size for m in archive_result.manifest if m.size]) - if result['status'] == 'success-file' and archive_result.file_resource and archive_result.file_file_meta: + if result[ + 'status'] == 'success-file' and archive_result.file_resource and archive_result.file_file_meta: file_result = dict( hit=True, status='success', @@ -397,10 +411,13 @@ class IngestFilesetWorker(IngestFileWorker): if archive_result.file_resource.cdx: file_result['cdx'] = cdx_to_dict(archive_result.file_resource.cdx) if archive_result.file_resource.revisit_cdx: - file_result['revisit_cdx'] = cdx_to_dict(archive_result.file_resource.revisit_cdx) + file_result['revisit_cdx'] = cdx_to_dict( + archive_result.file_resource.revisit_cdx) file_result['request']['ingest_type'] = request['ingest_type'] + "-file" # call the super() (ingest_file) version of process_hit() - info = self.process_file_hit(file_result['request']['ingest_type'], archive_result.file_resource, archive_result.file_file_meta) + info = self.process_file_hit(file_result['request']['ingest_type'], + archive_result.file_resource, + archive_result.file_file_meta) file_result.update(info) if self.ingest_file_result_sink: self.ingest_file_result_sink.push_record(result.copy()) @@ -410,17 +427,19 @@ class IngestFilesetWorker(IngestFileWorker): if result['status'].startswith('success'): result['hit'] = True print("[SUCCESS {:>5}] file_count={} total_size={} strategy={}".format( - ingest_type, - result['file_count'], - result['total_size'], - ingest_strategy, - ), file=sys.stderr) + ingest_type, + result['file_count'], + 
result['total_size'], + ingest_strategy, + ), + file=sys.stderr) else: print("[FAIL {:>5}] status={} file_count={} total_size={} strategy={}".format( - ingest_type, - result['status'], - result['file_count'], - result['total_size'], - ingest_strategy, - ), file=sys.stderr) + ingest_type, + result['status'], + result['file_count'], + result['total_size'], + ingest_strategy, + ), + file=sys.stderr) return result diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py index 9c72dd5..bf25d5d 100644 --- a/python/sandcrawler/ingest_html.py +++ b/python/sandcrawler/ingest_html.py @@ -1,4 +1,3 @@ - import argparse import datetime import io @@ -11,16 +10,20 @@ import pydantic import trafilatura from selectolax.parser import HTMLParser -from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules -from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, WaybackContentError, - cdx_to_dict, fix_transfer_encoding) -from sandcrawler.misc import clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal +from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio, + html_extract_resources, load_adblock_rules) +from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, + WaybackContentError, cdx_to_dict, fix_transfer_encoding) +from sandcrawler.misc import (clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, + url_fuzzy_equal) TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}" + def html_extract_body_teixml(doc: bytes) -> dict: try: - tei_xml = trafilatura.extract(doc, + tei_xml = trafilatura.extract( + doc, output_format='xmltei', include_comments=False, include_formatting=True, @@ -33,13 +36,19 @@ def html_extract_body_teixml(doc: bytes) -> dict: if tei_xml: body_txt = teixml_body_text(tei_xml) word_count = len(body_txt.split()) - return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count) - elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'): + return dict(status="success", + agent=TRAFILATURA_AGENT, + tei_xml=tei_xml, + word_count=word_count) + elif doc.startswith( + b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">' + ): # hack for firstmonday.org return html_extract_body_teixml(doc[106:]) else: return dict(status="empty-xml", agent=TRAFILATURA_AGENT) + def teixml_body_text(doc_xml: str) -> str: ns = {"tei": "http://www.tei-c.org/ns/1.0"} tree = ET.fromstring(doc_xml) @@ -49,6 +58,7 @@ def teixml_body_text(doc_xml: str) -> str: else: return "" + class WebResource(pydantic.BaseModel): surt: str timestamp: datetime.datetime @@ -61,16 +71,15 @@ class WebResource(pydantic.BaseModel): resource_type: Optional[str] class Config: - json_encoders = { - datetime.datetime: lambda dt: dt.isoformat() - } + json_encoders = {datetime.datetime: lambda dt: dt.isoformat()} + class IngestWebResult(pydantic.BaseModel): status: str hit: bool error_message: Optional[str] cdx: Optional[dict] - terminal: Optional[Any] # TODO + terminal: Optional[Any] # TODO request: Optional[Any] # TODO file_meta: Optional[dict] html_biblio: Optional[BiblioMetadata] @@ -84,6 +93,7 @@ class IngestWebResult(pydantic.BaseModel): datetime.datetime: lambda dt: dt.isoformat(), } + class HtmlMetaRow(pydantic.BaseModel): sha1hex: str status: str @@ -106,7 +116,7 @@ class 
HtmlMetaRow(pydantic.BaseModel): """ return ( self.sha1hex, - datetime.datetime.now(), # updated + datetime.datetime.now(), # updated self.status, self.scope, self.has_teixml, @@ -117,7 +127,8 @@ class HtmlMetaRow(pydantic.BaseModel): ) -def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]: +def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, + when: Optional[datetime.datetime]) -> List[WebResource]: """ This is the lazy version that just does a CDX lookup for each resource. @@ -132,27 +143,30 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, if not cdx_row: raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}") if cdx_row.url != resource['url'] and not url_fuzzy_equal(cdx_row.url, resource['url']): - print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr) + print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", + file=sys.stderr) if not cdx_row.status_code: # TODO: fall back to a full fetch? print(f" WARN: skipping revisit record", file=sys.stderr) continue - full.append(WebResource( - surt=cdx_row.surt, - timestamp=cdx_row.datetime, - url=cdx_row.url, - sha1hex=cdx_row.sha1hex, - mimetype=cdx_row.mimetype, - status_code=cdx_row.status_code, - size=None, - sha256hex=None, - resource_type=resource['type'], - )) + full.append( + WebResource( + surt=cdx_row.surt, + timestamp=cdx_row.datetime, + url=cdx_row.url, + sha1hex=cdx_row.sha1hex, + mimetype=cdx_row.mimetype, + status_code=cdx_row.status_code, + size=None, + sha256hex=None, + resource_type=resource['type'], + )) return full -def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]: +def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, + when: Optional[datetime.datetime]) -> List[WebResource]: """ This is the full version which fetches each resource from wayback/petabox and calculates additional hashes. 
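quick_fetch_html_resources() above turns each CDX hit into a WebResource, a pydantic model whose Config serializes datetimes as ISO-8601 strings. A trimmed-down sketch of that serialization behaviour, assuming pydantic v1 (which the .json(exclude_none=True) calls elsewhere in this module imply); the field values are placeholders:

import datetime
from typing import Optional

import pydantic  # assumes pydantic v1.x


class MiniWebResource(pydantic.BaseModel):
    # a few of the WebResource fields above, for illustration only
    surt: str
    timestamp: datetime.datetime
    url: str
    sha1hex: str
    mimetype: Optional[str]

    class Config:
        # same convention as WebResource above: datetimes become ISO-8601 strings
        json_encoders = {datetime.datetime: lambda dt: dt.isoformat()}


if __name__ == "__main__":
    r = MiniWebResource(
        surt="org,example)/paper.pdf",
        timestamp=datetime.datetime(2020, 10, 28, 23, 51, 3),
        url="http://example.org/paper.pdf",
        sha1hex="0" * 40,
        mimetype="application/pdf",
    )
    print(r.json(exclude_none=True))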
@@ -168,23 +182,28 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}") file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True) if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex: - raise WaybackContentError(f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}") - full.append(WebResource( - surt=wayback_resp.cdx.surt, - timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime), - url=wayback_resp.cdx.url, - sha1hex=file_meta['sha1hex'], - mimetype=file_meta['mimetype'], - status_code=wayback_resp.cdx.status_code or wayback_resp.revisit_cdx.status_code, - size=file_meta['size_bytes'], - sha256hex=file_meta['sha256hex'], - resource_type=resource['type'], - )) + raise WaybackContentError( + f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}" + ) + full.append( + WebResource( + surt=wayback_resp.cdx.surt, + timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime), + url=wayback_resp.cdx.url, + sha1hex=file_meta['sha1hex'], + mimetype=file_meta['mimetype'], + status_code=wayback_resp.cdx.status_code + or wayback_resp.revisit_cdx.status_code, + size=file_meta['size_bytes'], + sha256hex=file_meta['sha256hex'], + resource_type=resource['type'], + )) return full -def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]) -> Optional[str]: +def html_guess_platform(url: str, doc: HTMLParser, + biblio: Optional[BiblioMetadata]) -> Optional[str]: generator: Optional[str] = None generator_elem = doc.css_first("meta[name='generator']") @@ -229,7 +248,9 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada return None -def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str: + +def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], + word_count: Optional[int]) -> str: """ This function tries to guess if an HTML document represents one of: @@ -328,7 +349,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata] return "unknown" -def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult: +def run_single(url: str, + timestamp: Optional[str] = None, + quick_mode: bool = False) -> IngestWebResult: adblock = load_adblock_rules() wayback_client = WaybackClient() @@ -375,7 +398,8 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal full_resources: List[WebResource] = [] if quick_mode: - full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when) + full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, + when) else: full_resources = fetch_html_resources(raw_resources, wayback_client, when) @@ -399,14 +423,11 @@ def main() -> None: python -m sandcrawler.ingest_html """ - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) subparsers = parser.add_subparsers() - sub = subparsers.add_parser( - "single", help="tries to ingest a single URL, dumps result to stdout" - ) + sub = subparsers.add_parser("single", + help="tries to ingest a single URL, dumps result to stdout") sub.set_defaults(func="run_single") sub.add_argument( "url", @@ -437,5 +458,6 @@ def main() -> None: #func() raise NotImplementedError() + 
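fetch_html_resources() above raises WaybackContentError when the SHA-1 of the fetched body disagrees with the hash recorded in the CDX row. The same integrity check as a standalone sketch, using hashlib directly and a plain ValueError in place of the project's exception type:

import hashlib


def verify_body_sha1(body: bytes, expected_sha1hex: str) -> None:
    """Raise if the payload hash does not match the CDX record."""
    got = hashlib.sha1(body).hexdigest()
    if got != expected_sha1hex.lower():
        raise ValueError("payload sha1hex mismatch: expected {}, got {}".format(
            expected_sha1hex, got))


if __name__ == "__main__":
    blob = b"%PDF-1.4 hello"
    verify_body_sha1(blob, hashlib.sha1(blob).hexdigest())  # passes silently
    print("sha1 check ok")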
if __name__ == "__main__": main() diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py index b617178..188621f 100644 --- a/python/sandcrawler/minio.py +++ b/python/sandcrawler/minio.py @@ -1,4 +1,3 @@ - import hashlib import io import os @@ -7,7 +6,6 @@ import minio class SandcrawlerMinioClient(object): - def __init__(self, host_url, access_key, secret_key, default_bucket=None): """ host is minio connection string (host:port) diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index cf8c4bd..ddbd95a 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -1,4 +1,3 @@ - import base64 import datetime import hashlib @@ -19,22 +18,28 @@ def clean_url(s: str) -> str: parsed.colon_before_port = b'' return str(urlcanon.whatwg(parsed)) + def url_fuzzy_equal(left: str, right: str) -> bool: """ TODO: use proper surt library and canonicalization for this check """ - fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:]) - fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:]) + fuzzy_left = '://'.join( + clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:]) + fuzzy_right = '://'.join( + clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:]) if fuzzy_left == fuzzy_right: return True elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/": return True return False + def test_url_fuzzy_equal() -> None: assert True == url_fuzzy_equal( "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", - "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree") + "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree" + ) + def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict: """ @@ -45,10 +50,10 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict: assert blob is not None if not allow_empty: assert blob - if len(blob) < 1024*1024: + if len(blob) < 1024 * 1024: mimetype = magic.Magic(mime=True).from_buffer(blob) else: - mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024*1024)]) + mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024 * 1024)]) if mimetype in ("application/xml", "text/xml"): # crude checks for XHTML or JATS XML, using only first 1 kB of file if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]: @@ -70,6 +75,7 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict: mimetype=mimetype, ) + def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict: """ Variant of gen_file_metadata() which works with files on local disk @@ -92,7 +98,7 @@ def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict: size_bytes = 0 with open(path, 'rb') as f: while True: - chunk = f.read(1024*1024) + chunk = f.read(1024 * 1024) if not chunk: break size_bytes += len(chunk) @@ -108,6 +114,7 @@ def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict: mimetype=mimetype, ) + def b32_hex(s: str) -> str: """ Converts a base32-encoded SHA-1 checksum into hex-encoded @@ -123,6 +130,7 @@ def b32_hex(s: str) -> str: raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + NORMAL_MIME = ( 'application/pdf', 
'application/postscript', @@ -131,6 +139,7 @@ NORMAL_MIME = ( 'application/octet-stream', ) + def normalize_mime(raw: str) -> Optional[str]: raw = raw.lower().strip() for norm in NORMAL_MIME: @@ -142,9 +151,7 @@ def normalize_mime(raw: str) -> Optional[str]: return 'text/xml' if raw.startswith('application/x-pdf'): return 'application/pdf' - if raw in ( - '.pdf', - ): + if raw in ('.pdf', ): return 'application/pdf' if raw in ( 'application/download', @@ -154,7 +161,7 @@ def normalize_mime(raw: str) -> Optional[str]: 'application/octetstream', 'application/force-download', 'application/unknown', - ): + ): return 'application/octet-stream' return None @@ -193,8 +200,8 @@ def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]: offset = cdx[9] warc = cdx[10] - if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit() - and len(sha1b32) == 32 and dt.isdigit()): + if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit() and len(sha1b32) == 32 + and dt.isdigit()): return None if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc): @@ -221,6 +228,7 @@ def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]: warc_path=warc, ) + def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]: if not dt_str: return None @@ -229,23 +237,39 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]: except Exception: return None + def test_parse_cdx_datetime() -> None: assert parse_cdx_datetime("") == None assert parse_cdx_datetime("asdf") == None assert parse_cdx_datetime("19930203123045") != None - assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3) + assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020, + month=10, + day=28, + hour=23, + minute=51, + second=3) + def datetime_to_cdx(dt: datetime.datetime) -> str: return '%04d%02d%02d%02d%02d%02d' % ( - dt.year, dt.month, dt.day, - dt.hour, dt.minute, dt.second, + dt.year, + dt.month, + dt.day, + dt.hour, + dt.minute, + dt.second, ) + def test_datetime_to_cdx() -> None: - assert "20201028235103" == datetime_to_cdx(datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)) + assert "20201028235103" == datetime_to_cdx( + datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)) + -def requests_retry_session(retries=10, backoff_factor=3, - status_forcelist=(500, 502, 504), session=None) -> requests.Session: +def requests_retry_session(retries=10, + backoff_factor=3, + status_forcelist=(500, 502, 504), + session=None) -> requests.Session: """ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests """ @@ -262,6 +286,7 @@ def requests_retry_session(retries=10, backoff_factor=3, session.mount('https://', adapter) return session + def sanitize_fs_path(path: str) -> str: """ From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540 @@ -271,6 +296,7 @@ def sanitize_fs_path(path: str) -> str: # - making the path relative return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/") + def test_sanitize_fs_path() -> None: assert sanitize_fs_path("/thing.png") == "thing.png" assert sanitize_fs_path("../../thing.png") == "thing.png" diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 2fb34b8..190672d 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -1,4 +1,3 @@ - import datetime import json import sys @@ -153,19 
+152,20 @@ BAD_PDF_SHA1HEX = [ "fd9bd560662e070b222d63052830837829c490f0", ] + @dataclass class PdfExtractResult: sha1hex: str status: str error_msg: Optional[str] = None - file_meta: Optional[Dict[str,Any]] = None + file_meta: Optional[Dict[str, Any]] = None text: Optional[str] = None page0_thumbnail: Optional[bytes] = None has_page0_thumbnail: bool = False meta_xml: Optional[str] = None - pdf_info: Optional[Dict[str,Any]] = None - pdf_extra: Optional[Dict[str,Any]] = None - source: Optional[Dict[str,Any]] = None + pdf_info: Optional[Dict[str, Any]] = None + pdf_extra: Optional[Dict[str, Any]] = None + source: Optional[Dict[str, Any]] = None def to_pdftext_dict(self) -> dict: """ @@ -221,7 +221,8 @@ class PdfExtractResult: ) else: pdf_extra = dict() - for k in ('page_count', 'page0_height', 'page0_width', 'permanent_id', 'pdf_version'): + for k in ('page_count', 'page0_height', 'page0_width', 'permanent_id', + 'pdf_version'): if record.get(k): pdf_extra[k] = record[k] return PdfExtractResult( @@ -255,7 +256,7 @@ class PdfExtractResult: metadata_json = json.dumps(metadata, sort_keys=True) return ( self.sha1hex, - datetime.datetime.now(), # updated + datetime.datetime.now(), # updated self.status, self.has_page0_thumbnail, pdf_extra.get('page_count'), @@ -269,7 +270,7 @@ class PdfExtractResult: ) -def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult: +def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExtractResult: """ A known issue is that output text is in "physical layout" mode, which means columns will be side-by-side. We would prefer a single stream of tokens! @@ -330,7 +331,8 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr renderer = poppler.PageRenderer() try: full_img = renderer.render_page(page0) - img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', "BGRA", 0, 1) + img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', + "BGRA", 0, 1) img.thumbnail(thumb_size, Image.BICUBIC) buf = BytesIO() img.save(buf, thumb_type) @@ -356,14 +358,14 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr ) # Kafka message size limit; cap at about 1 MByte - if len(full_text)> 1000000: + if len(full_text) > 1000000: return PdfExtractResult( sha1hex=sha1hex, status='text-too-large', error_msg="full_text chars: {}".format(len(full_text)), file_meta=file_meta, ) - if len(pdf.metadata)> 1000000: + if len(pdf.metadata) > 1000000: return PdfExtractResult( sha1hex=sha1hex, status='text-too-large', @@ -414,8 +416,8 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr ), ) -class PdfExtractWorker(SandcrawlerFetchWorker): +class PdfExtractWorker(SandcrawlerFetchWorker): def __init__(self, wayback_client=None, sink=None, **kwargs): super().__init__(wayback_client=wayback_client) self.wayback_client = wayback_client @@ -445,12 +447,12 @@ class PdfExtractWorker(SandcrawlerFetchWorker): self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex) return result.to_pdftext_dict() + class PdfExtractBlobWorker(SandcrawlerWorker): """ This is sort of like PdfExtractWorker, except it receives blobs directly, instead of fetching blobs from some remote store. 
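process_pdf() above renders page zero with poppler, wraps the raw buffer in a PIL Image, and shrinks it to a (180, 300) JPEG held in memory. A self-contained sketch of just the PIL half of that step, with a blank image standing in for the rendered page (the resampling-constant lookup is a portability shim, not something the code above does):

from io import BytesIO

from PIL import Image


def thumbnail_jpeg(img: Image.Image, thumb_size=(180, 300)) -> bytes:
    """Downscale in place, then encode to JPEG in memory rather than on disk."""
    img = img.convert("RGB")  # JPEG cannot store an alpha channel
    # Image.BICUBIC moved to Image.Resampling.BICUBIC in newer Pillow releases
    bicubic = getattr(Image, "Resampling", Image).BICUBIC
    img.thumbnail(thumb_size, bicubic)
    buf = BytesIO()
    img.save(buf, "JPEG")
    return buf.getvalue()


if __name__ == "__main__":
    page_image = Image.new("RGBA", (1240, 1754), "white")  # stand-in for a rendered page
    print(len(thumbnail_jpeg(page_image)), "bytes of JPEG thumbnail")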
""" - def __init__(self, sink=None, **kwargs): super().__init__() self.sink = sink @@ -466,4 +468,3 @@ class PdfExtractBlobWorker(SandcrawlerWorker): self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex) return result.to_pdftext_dict() - diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index 7d03357..e3d4a54 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -1,4 +1,3 @@ - import time import requests @@ -8,7 +7,6 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker class PdfTrioClient(object): - def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs): self.host_url = host_url self.http_session = requests_retry_session(retries=3, backoff_factor=3) @@ -51,9 +49,7 @@ class PdfTrioClient(object): 'error_msg': 'pdftrio request connection timout', } - info = dict( - status_code=pdftrio_response.status_code, - ) + info = dict(status_code=pdftrio_response.status_code, ) if pdftrio_response.status_code == 200: resp_json = pdftrio_response.json() assert 'ensemble_score' in resp_json @@ -72,7 +68,6 @@ class PdfTrioWorker(SandcrawlerFetchWorker): """ This class is basically copied directly from GrobidWorker """ - def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs): super().__init__(wayback_client=wayback_client) self.pdftrio_client = pdftrio_client @@ -103,12 +98,12 @@ class PdfTrioWorker(SandcrawlerFetchWorker): result['timing']['fetch_sec'] = fetch_sec return result + class PdfTrioBlobWorker(SandcrawlerWorker): """ This is sort of like PdfTrioWorker, except it receives blobs directly, instead of fetching blobs from some remote store. """ - def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs): super().__init__() self.pdftrio_client = pdftrio_client @@ -128,4 +123,3 @@ class PdfTrioBlobWorker(SandcrawlerWorker): total_sec=time.time() - start_process, ) return result - diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 66a36bc..44c03f2 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -1,4 +1,3 @@ - """ cdx - read raw CDX, filter @@ -32,7 +31,6 @@ from sandcrawler.workers import SandcrawlerWorker class PersistCdxWorker(SandcrawlerWorker): - def __init__(self, db_url, **kwargs): super().__init__() self.db = SandcrawlerPostgresClient(db_url) @@ -56,8 +54,8 @@ class PersistCdxWorker(SandcrawlerWorker): self.db.commit() return [] -class PersistIngestFileResultWorker(SandcrawlerWorker): +class PersistIngestFileResultWorker(SandcrawlerWorker): def __init__(self, db_url, **kwargs): super().__init__() self.db = SandcrawlerPostgresClient(db_url) @@ -78,8 +76,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): # backwards compat hacks; transform request to look like current schema if raw.get('ingest_type') == 'file': raw['ingest_type'] = 'pdf' - if (not raw.get('link_source') - and raw.get('base_url') + if (not raw.get('link_source') and raw.get('base_url') and raw.get('ext_ids', {}).get('doi') and raw['base_url'] == "https://doi.org/{}".format(raw['ext_ids']['doi'])): # set link_source(_id) for old ingest requests @@ -119,7 +116,6 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): if not request['request']: request['request'] = None return request - def file_result_to_row(self, raw: dict) -> Optional[dict]: """ @@ -137,7 +133,8 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): ingest_type = raw['request'].get('ingest_type') if ingest_type == 'file': ingest_type = 'pdf' - if 
ingest_type not in ('pdf', 'xml', 'html', 'component', 'src', 'dataset', 'dataset-file'): + if ingest_type not in ('pdf', 'xml', 'html', 'component', 'src', 'dataset', + 'dataset-file'): self.counts['skip-ingest-type'] += 1 return None if raw['status'] in ("existing", ): @@ -153,7 +150,9 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): if terminal: result['terminal_url'] = terminal.get('terminal_url') or terminal.get('url') result['terminal_dt'] = terminal.get('terminal_dt') - result['terminal_status_code'] = terminal.get('terminal_status_code') or terminal.get('status_code') or terminal.get('http_code') + result['terminal_status_code'] = terminal.get( + 'terminal_status_code') or terminal.get('status_code') or terminal.get( + 'http_code') if result['terminal_status_code']: result['terminal_status_code'] = int(result['terminal_status_code']) result['terminal_sha1hex'] = terminal.get('terminal_sha1hex') @@ -215,9 +214,12 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): 'manifest': raw.get('manifest'), } if result.get('fileset_bundle'): - result['archiveorg_item_bundle_path'] = result['fileset_bundle'].get('archiveorg_item_bundle_path') - result['web_bundle_url'] = result['fileset_bundle'].get('terminal', {}).get('terminal_url') - result['web_bundle_dt'] = result['fileset_bundle'].get('terminal', {}).get('terminal_dt') + result['archiveorg_item_bundle_path'] = result['fileset_bundle'].get( + 'archiveorg_item_bundle_path') + result['web_bundle_url'] = result['fileset_bundle'].get('terminal', + {}).get('terminal_url') + result['web_bundle_dt'] = result['fileset_bundle'].get('terminal', + {}).get('terminal_dt') return result def push_batch(self, batch): @@ -243,7 +245,9 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): # these schemas match, so can just pass through cdx_batch = [r['cdx'] for r in batch if r.get('hit') and r.get('cdx')] - revisit_cdx_batch = [r['revisit_cdx'] for r in batch if r.get('hit') and r.get('revisit_cdx')] + revisit_cdx_batch = [ + r['revisit_cdx'] for r in batch if r.get('hit') and r.get('revisit_cdx') + ] cdx_batch.extend(revisit_cdx_batch) # filter to full CDX lines, with full warc_paths (not liveweb) cdx_batch = [r for r in cdx_batch if r.get('warc_path') and ("/" in r['warc_path'])] @@ -258,24 +262,31 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): self.counts['insert-file_meta'] += resp[0] self.counts['update-file_meta'] += resp[1] - html_meta_batch = [self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_body')] + html_meta_batch = [ + self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_body') + ] if html_meta_batch: resp = self.db.insert_html_meta(self.cur, html_meta_batch, on_conflict="update") self.counts['insert-html_meta'] += resp[0] self.counts['update-html_meta'] += resp[1] - fileset_platform_batch = [self.result_to_platform_row(raw) for raw in batch if raw.get('request', {}).get('ingest_type') == 'dataset' and raw.get('platform_name')] + fileset_platform_batch = [ + self.result_to_platform_row(raw) for raw in batch if + raw.get('request', {}).get('ingest_type') == 'dataset' and raw.get('platform_name') + ] fileset_platform_batch = [p for p in fileset_platform_batch if p] if fileset_platform_batch: - resp = self.db.insert_ingest_fileset_platform(self.cur, fileset_platform_batch, on_conflict="update") + resp = self.db.insert_ingest_fileset_platform(self.cur, + fileset_platform_batch, + on_conflict="update") self.counts['insert-fileset_platform'] += resp[0] 
self.counts['update-fileset_platform'] += resp[1] self.db.commit() return [] -class PersistIngestFilesetWorker(SandcrawlerWorker): +class PersistIngestFilesetWorker(SandcrawlerWorker): def __init__(self, db_url, **kwargs): super().__init__() self.db = SandcrawlerPostgresClient(db_url) @@ -287,8 +298,8 @@ class PersistIngestFilesetWorker(SandcrawlerWorker): """ raise NotImplementedError -class PersistIngestRequestWorker(PersistIngestFileResultWorker): +class PersistIngestRequestWorker(PersistIngestFileResultWorker): def __init__(self, db_url, **kwargs): super().__init__(db_url=db_url) @@ -315,8 +326,8 @@ class PersistIngestRequestWorker(PersistIngestFileResultWorker): self.db.commit() return [] -class PersistGrobidWorker(SandcrawlerWorker): +class PersistGrobidWorker(SandcrawlerWorker): def __init__(self, db_url, **kwargs): super().__init__() self.grobid = GrobidClient() @@ -406,7 +417,6 @@ class PersistGrobidDiskWorker(SandcrawlerWorker): This could be refactored into a "Sink" type with an even thinner wrapper. """ - def __init__(self, output_dir): super().__init__() self.output_dir = output_dir @@ -424,7 +434,7 @@ class PersistGrobidDiskWorker(SandcrawlerWorker): if record.get('status_code') != 200 or not record.get('tei_xml'): return False - assert(len(record['key'])) == 40 + assert (len(record['key'])) == 40 p = "{}/{}".format(self.output_dir, self._blob_path(record['key'])) os.makedirs(os.path.dirname(p), exist_ok=True) with open(p, 'w') as f: @@ -434,7 +444,6 @@ class PersistGrobidDiskWorker(SandcrawlerWorker): class PersistPdfTrioWorker(SandcrawlerWorker): - def __init__(self, db_url, **kwargs): super().__init__() self.db = SandcrawlerPostgresClient(db_url) @@ -458,7 +467,10 @@ class PersistPdfTrioWorker(SandcrawlerWorker): self.counts['insert-pdftrio'] += resp[0] self.counts['update-pdftrio'] += resp[1] - file_meta_batch = [r['file_meta'] for r in batch if r['pdf_trio']['status'] == "success" and r.get('file_meta')] + file_meta_batch = [ + r['file_meta'] for r in batch + if r['pdf_trio']['status'] == "success" and r.get('file_meta') + ] resp = self.db.insert_file_meta(self.cur, file_meta_batch) self.counts['insert-file-meta'] += resp[0] self.counts['update-file-meta'] += resp[1] @@ -473,7 +485,6 @@ class PersistPdfTextWorker(SandcrawlerWorker): Should keep batch sizes small. """ - def __init__(self, db_url, **kwargs): super().__init__() self.s3 = SandcrawlerMinioClient( @@ -545,7 +556,6 @@ class PersistThumbnailWorker(SandcrawlerWorker): This worker *must* be used with raw kakfa mode; thumbnails are *not* wrapped in JSON like most sandcrawler kafka messages. """ - def __init__(self, **kwargs): super().__init__() self.s3 = SandcrawlerMinioClient( @@ -583,7 +593,6 @@ class GenericPersistDocWorker(SandcrawlerWorker): Objects are assumed to be JSON-wrapped strings. """ - def __init__(self, **kwargs): super().__init__() self.s3 = SandcrawlerMinioClient( @@ -624,7 +633,6 @@ class PersistXmlDocWorker(GenericPersistDocWorker): Pushes TEI-XML file to blob store (S3/seaweed/minio). Does not talk to sandcrawler database (SQL). """ - def __init__(self, **kwargs): super().__init__(**kwargs) self.s3_extension = kwargs.get('s3_extension', ".jats.xml") @@ -637,7 +645,6 @@ class PersistHtmlTeiXmlWorker(GenericPersistDocWorker): Pushes TEI-XML file to blob store (S3/seaweed/minio). Does not talk to sandcrawler database (SQL). 
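push_batch() above repeatedly filters one batch of ingest results down to the sub-records each table wants (cdx, revisit_cdx, file_meta, html_meta, fileset_platform) before bulk-inserting. A minimal sketch of that filtering pattern for the CDX case, with toy records in place of real ingest results:

from typing import Dict, List


def select_cdx_rows(batch: List[Dict]) -> List[Dict]:
    """Pick the CDX and revisit-CDX sub-records worth persisting from a batch."""
    cdx_batch = [r["cdx"] for r in batch if r.get("hit") and r.get("cdx")]
    cdx_batch += [r["revisit_cdx"] for r in batch if r.get("hit") and r.get("revisit_cdx")]
    # keep only full CDX rows that point at a real WARC file (not liveweb captures)
    return [r for r in cdx_batch if r.get("warc_path") and "/" in r["warc_path"]]


if __name__ == "__main__":
    batch = [
        {"hit": True, "cdx": {"warc_path": "liveweb"}},                          # dropped
        {"hit": True, "cdx": {"warc_path": "archive/ITEM/file.warc.gz"}},        # kept
        {"hit": False, "revisit_cdx": {"warc_path": "archive/ITEM/x.warc.gz"}},  # dropped
    ]
    print(select_cdx_rows(batch))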
""" - def __init__(self, **kwargs): super().__init__(**kwargs) self.s3_extension = kwargs.get('s3_extension', ".tei.xml") diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index d8a4016..7135f4c 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -1,4 +1,3 @@ - import json import multiprocessing.pool import signal @@ -21,7 +20,6 @@ class SandcrawlerWorker(object): Usually these get "pushed" into by a RecordPusher. Output goes to another worker (pipeline-style), or defaults to stdout. """ - def __init__(self): self.counts = Counter() self.sink = None @@ -62,9 +60,9 @@ class SandcrawlerWorker(object): multithreading or if signal-based timeouts are used elsewhere in the same process. """ - def timeout_handler(signum, frame): raise TimeoutError("timeout processing record") + signal.signal(signal.SIGALRM, timeout_handler) resp = None signal.alarm(int(timeout)) @@ -72,7 +70,7 @@ class SandcrawlerWorker(object): resp = self.push_record(task, key=key) except TimeoutError: self.counts['timeout'] += 1 - resp = self.timeout_response(task) # pylint: disable=assignment-from-none + resp = self.timeout_response(task) # pylint: disable=assignment-from-none # TODO: what if it is this push_record() itself that is timing out? if resp and self.sink: self.sink.push_record(resp) @@ -113,7 +111,6 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): Wrapper of SandcrawlerWorker that adds a helper method to fetch blobs (eg, PDFs) from wayback, archive.org, or other sources. """ - def __init__(self, wayback_client, **kwargs): super().__init__(**kwargs) self.wayback_client = wayback_client @@ -178,7 +175,8 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): ) blob = resp.content else: - raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed") + raise ValueError( + "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed") if not blob: return dict( key=default_key, @@ -192,8 +190,8 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): blob=blob, ) -class MultiprocessWrapper(SandcrawlerWorker): +class MultiprocessWrapper(SandcrawlerWorker): def __init__(self, worker, sink, jobs=None): self.counts = Counter() self.worker = worker @@ -226,21 +224,21 @@ class MultiprocessWrapper(SandcrawlerWorker): print("Multiprocessing: {}".format(self.counts), file=sys.stderr) return worker_counts + class BlackholeSink(SandcrawlerWorker): """ Dummy SandcrawlerWorker. That doesn't do or process anything. Useful for tests. 
""" - def push_record(self, task, key=None): return def push_batch(self, tasks): return -class KafkaSink(SandcrawlerWorker): +class KafkaSink(SandcrawlerWorker): def __init__(self, kafka_hosts, produce_topic, **kwargs): self.sink = None self.counts = Counter() @@ -249,13 +247,12 @@ class KafkaSink(SandcrawlerWorker): config = self.producer_config({ 'bootstrap.servers': kafka_hosts, - 'message.max.bytes': 30000000, # ~30 MBytes; broker is ~50 MBytes + 'message.max.bytes': 30000000, # ~30 MBytes; broker is ~50 MBytes 'api.version.request': True, 'api.version.fallback.ms': 0, }) self.producer = Producer(config) - @staticmethod def _fail_fast(err, msg): if err is not None: @@ -270,7 +267,7 @@ class KafkaSink(SandcrawlerWorker): 'delivery.report.only.error': True, 'default.topic.config': { 'message.timeout.ms': 30000, - 'request.required.acks': -1, # all brokers must confirm + 'request.required.acks': -1, # all brokers must confirm } }) return config @@ -285,11 +282,7 @@ class KafkaSink(SandcrawlerWorker): msg = msg.encode('utf-8') assert type(msg) == bytes - self.producer.produce( - self.produce_topic, - msg, - key=key, - on_delivery=self._fail_fast) + self.producer.produce(self.produce_topic, msg, key=key, on_delivery=self._fail_fast) self.counts['produced'] += 1 # check for errors etc @@ -308,7 +301,6 @@ class KafkaCompressSink(KafkaSink): """ Variant of KafkaSink for large documents. Used for, eg, GROBID output. """ - def producer_config(self, kafka_config): config = kafka_config.copy() config.update({ @@ -319,7 +311,7 @@ class KafkaCompressSink(KafkaSink): 'delivery.report.only.error': True, 'default.topic.config': { 'message.timeout.ms': 30000, - 'request.required.acks': -1, # all brokers must confirm + 'request.required.acks': -1, # all brokers must confirm } }) return config @@ -330,7 +322,6 @@ class RecordPusher: Base class for different record sources to be pushed into workers. Pretty trivial interface, just wraps an importer and pushes records in to it. 
""" - def __init__(self, worker, **kwargs): self.counts = Counter() self.worker = worker @@ -348,7 +339,6 @@ class RecordPusher: class JsonLinePusher(RecordPusher): - def __init__(self, worker, json_file, **kwargs): self.counts = Counter() self.worker = worker @@ -387,7 +377,6 @@ class JsonLinePusher(RecordPusher): class CdxLinePusher(RecordPusher): - def __init__(self, worker, cdx_file, **kwargs): self.counts = Counter() self.worker = worker @@ -409,7 +398,8 @@ class CdxLinePusher(RecordPusher): if not record: self.counts['skip-parse'] += 1 continue - if self.filter_http_statuses and record['http_status'] not in self.filter_http_statuses: + if self.filter_http_statuses and record[ + 'http_status'] not in self.filter_http_statuses: self.counts['skip-http_status'] += 1 continue if self.filter_mimetypes and record['mimetype'] not in self.filter_mimetypes: @@ -434,7 +424,6 @@ class CdxLinePusher(RecordPusher): class ZipfilePusher(RecordPusher): - def __init__(self, worker, zipfile_path, **kwargs): self.counts = Counter() self.worker = worker @@ -472,8 +461,8 @@ class ZipfilePusher(RecordPusher): print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr) return self.counts -class KafkaJsonPusher(RecordPusher): +class KafkaJsonPusher(RecordPusher): def __init__(self, worker, kafka_hosts, consume_topic, group, **kwargs): self.counts = Counter() self.worker = worker @@ -499,12 +488,11 @@ class KafkaJsonPusher(RecordPusher): # case where there there is one update and thousands of creates; # update would be lingering in worker, and if worker crashed # never created. Not great. - batch = self.consumer.consume( - num_messages=self.batch_size, - timeout=self.poll_interval) + batch = self.consumer.consume(num_messages=self.batch_size, + timeout=self.poll_interval) print("... 
got {} kafka messages ({}sec poll interval)".format( - len(batch), self.poll_interval), - file=sys.stderr) + len(batch), self.poll_interval), + file=sys.stderr) if not batch: # TODO: could have some larger timeout here and # self.worker.finish() if it's been more than, eg, a couple @@ -541,7 +529,9 @@ class KafkaJsonPusher(RecordPusher): while not done: try: # use timeouts; don't want kafka itself to timeout - self.worker.push_record_timeout(record, key=msg.key(), timeout=self.process_timeout_sec) + self.worker.push_record_timeout(record, + key=msg.key(), + timeout=self.process_timeout_sec) break except SandcrawlerBackoffError as be: print("Backing off for 200 seconds: {}".format(be)) @@ -611,14 +601,14 @@ def make_kafka_consumer(hosts, consume_topic, group): for p in partitions: if p.error: raise KafkaException(p.error) - print("Kafka partitions rebalanced: {} / {}".format( - consumer, partitions), - file=sys.stderr) + print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions), + file=sys.stderr) consumer = Consumer(conf) # NOTE: it's actually important that topic_name *not* be bytes (UTF-8 # encoded) - consumer.subscribe([topic_name], + consumer.subscribe( + [topic_name], on_assign=on_rebalance, on_revoke=on_rebalance, ) diff --git a/python/sandcrawler/xml.py b/python/sandcrawler/xml.py index 7a0086d..83d53d4 100644 --- a/python/sandcrawler/xml.py +++ b/python/sandcrawler/xml.py @@ -1,4 +1,3 @@ - import xml.etree.ElementTree as ET diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index e185fad..3c76c17 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ These are generally for continuously running workers that consume from Kafka. Outputs might either be pushed back into Kafka, or directly into sandcrawler-db @@ -31,12 +30,8 @@ def run_grobid_extract(args): kafka_hosts=args.kafka_hosts, produce_topic=produce_topic, ) - grobid_client = GrobidClient( - host_url=args.grobid_host, - ) - wayback_client = WaybackClient( - host_url=args.grobid_host, - ) + grobid_client = GrobidClient(host_url=args.grobid_host, ) + wayback_client = WaybackClient(host_url=args.grobid_host, ) worker = GrobidWorker( grobid_client=grobid_client, wayback_client=wayback_client, @@ -51,6 +46,7 @@ def run_grobid_extract(args): ) pusher.run() + def run_pdf_extract(args): consume_topic = "sandcrawler-{}.unextracted".format(args.env) pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env) @@ -63,9 +59,7 @@ def run_pdf_extract(args): kafka_hosts=args.kafka_hosts, produce_topic=thumbnail_topic, ) - wayback_client = WaybackClient( - host_url=args.grobid_host, - ) + wayback_client = WaybackClient(host_url=args.grobid_host, ) worker = PdfExtractWorker( wayback_client=wayback_client, sink=pdftext_sink, @@ -81,6 +75,7 @@ def run_pdf_extract(args): ) pusher.run() + def run_persist_grobid(args): consume_topic = "sandcrawler-{}.grobid-output-pg".format(args.env) worker = PersistGrobidWorker( @@ -105,6 +100,7 @@ def run_persist_grobid(args): ) pusher.run() + def run_persist_pdftext(args): consume_topic = "sandcrawler-{}.pdf-text".format(args.env) worker = PersistPdfTextWorker( @@ -129,6 +125,7 @@ def run_persist_pdftext(args): ) pusher.run() + def run_persist_thumbnail(args): consume_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env) worker = PersistThumbnailWorker( @@ -150,6 +147,7 @@ def run_persist_thumbnail(args): ) pusher.run() + def run_persist_xml_doc(args: argparse.Namespace) -> None: consume_topic = 
f"sandcrawler-{args.env}.xml-doc" worker = PersistXmlDocWorker( @@ -168,6 +166,7 @@ def run_persist_xml_doc(args: argparse.Namespace) -> None: ) pusher.run() + def run_persist_html_teixml(args: argparse.Namespace) -> None: consume_topic = f"sandcrawler-{args.env}.html-teixml" worker = PersistHtmlTeiXmlWorker( @@ -186,11 +185,10 @@ def run_persist_html_teixml(args: argparse.Namespace) -> None: ) pusher.run() + def run_persist_pdftrio(args): consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env) - worker = PersistPdfTrioWorker( - db_url=args.db_url, - ) + worker = PersistPdfTrioWorker(db_url=args.db_url, ) pusher = KafkaJsonPusher( worker=worker, kafka_hosts=args.kafka_hosts, @@ -201,6 +199,7 @@ def run_persist_pdftrio(args): ) pusher.run() + def run_ingest_file(args): spn_cdx_retry_sec = 9.0 if args.bulk: @@ -228,9 +227,7 @@ def run_ingest_file(args): kafka_hosts=args.kafka_hosts, produce_topic=grobid_topic, ) - grobid_client = GrobidClient( - host_url=args.grobid_host, - ) + grobid_client = GrobidClient(host_url=args.grobid_host, ) pdftext_sink = KafkaCompressSink( kafka_hosts=args.kafka_hosts, produce_topic=pdftext_topic, @@ -268,11 +265,10 @@ def run_ingest_file(args): ) pusher.run() + def run_persist_ingest_file(args): consume_topic = "sandcrawler-{}.ingest-file-results".format(args.env) - worker = PersistIngestFileResultWorker( - db_url=args.db_url, - ) + worker = PersistIngestFileResultWorker(db_url=args.db_url, ) pusher = KafkaJsonPusher( worker=worker, kafka_hosts=args.kafka_hosts, @@ -283,90 +279,116 @@ def run_persist_ingest_file(args): ) pusher.run() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") + default="localhost:9092", + help="list of Kafka brokers (host/port) to use") parser.add_argument('--env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") + default="dev", + help="Kafka topic namespace to use (eg, prod, qa, dev)") parser.add_argument('--grobid-host', - default="http://grobid.qa.fatcat.wiki", - help="GROBID API host/port") + default="http://grobid.qa.fatcat.wiki", + help="GROBID API host/port") parser.add_argument('--db-url', - help="postgresql database connection string", - default="postgres:///sandcrawler") - parser.add_argument('--s3-url', - help="S3 (seaweedfs) backend URL", - default="localhost:9000") + help="postgresql database connection string", + default="postgres:///sandcrawler") + parser.add_argument('--s3-url', help="S3 (seaweedfs) backend URL", default="localhost:9000") parser.add_argument('--s3-access-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY')) + help="S3 (seaweedfs) credential", + default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') + or os.environ.get('MINIO_ACCESS_KEY')) parser.add_argument('--s3-secret-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY')) + help="S3 (seaweedfs) credential", + default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') + or os.environ.get('MINIO_SECRET_KEY')) parser.add_argument('--s3-bucket', - help="S3 (seaweedfs) bucket to persist into", - default="sandcrawler-dev") + help="S3 (seaweedfs) bucket to persist into", + default="sandcrawler-dev") 
subparsers = parser.add_subparsers() - sub_grobid_extract = subparsers.add_parser('grobid-extract', - help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka") + sub_grobid_extract = subparsers.add_parser( + 'grobid-extract', + help= + "daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka" + ) sub_grobid_extract.set_defaults(func=run_grobid_extract) - sub_pdf_extract = subparsers.add_parser('pdf-extract', - help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka") + sub_pdf_extract = subparsers.add_parser( + 'pdf-extract', + help= + "daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka" + ) sub_pdf_extract.set_defaults(func=run_pdf_extract) - sub_persist_grobid = subparsers.add_parser('persist-grobid', - help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres") + sub_persist_grobid = subparsers.add_parser( + 'persist-grobid', + help= + "daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres" + ) sub_persist_grobid.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") - sub_persist_grobid.add_argument('--db-only', + action='store_true', + help="only upload TEI-XML to S3 (don't write to database)") + sub_persist_grobid.add_argument( + '--db-only', action='store_true', help="only write status to database (don't upload TEI-XML to S3)") sub_persist_grobid.set_defaults(func=run_persist_grobid) - sub_persist_pdftext = subparsers.add_parser('persist-pdftext', - help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres") + sub_persist_pdftext = subparsers.add_parser( + 'persist-pdftext', + help= + "daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres" + ) sub_persist_pdftext.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") - sub_persist_pdftext.add_argument('--db-only', + action='store_true', + help="only upload TEI-XML to S3 (don't write to database)") + sub_persist_pdftext.add_argument( + '--db-only', action='store_true', help="only write status to database (don't upload TEI-XML to S3)") sub_persist_pdftext.set_defaults(func=run_persist_pdftext) - sub_persist_thumbnail = subparsers.add_parser('persist-thumbnail', - help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres") + sub_persist_thumbnail = subparsers.add_parser( + 'persist-thumbnail', + help= + "daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres" + ) sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail) - sub_persist_xml_doc = subparsers.add_parser('persist-xml-doc', - help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket") + sub_persist_xml_doc = subparsers.add_parser( + 'persist-xml-doc', + help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket" + ) sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc) - sub_persist_html_teixml = subparsers.add_parser('persist-html-teixml', - help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket") + sub_persist_html_teixml = subparsers.add_parser( + 'persist-html-teixml', + help= + "daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket" + ) 
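Each subparser above binds its worker entry point with set_defaults(func=run_...), and main() finishes by dispatching to args.func(args). A compact standalone sketch of that dispatch pattern; the subcommand and handler here are placeholders, not real sandcrawler workers:

import argparse
import sys


def run_demo(args):
    # stand-in for the run_*() handlers above
    print("would consume from sandcrawler-{}.some-topic".format(args.env))


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        default="dev",
                        help="Kafka topic namespace to use (eg, prod, qa, dev)")
    subparsers = parser.add_subparsers()

    sub_demo = subparsers.add_parser('demo', help="placeholder daemon subcommand")
    sub_demo.set_defaults(func=run_demo)

    args = parser.parse_args()
    if not getattr(args, 'func', None):
        parser.print_help()
        sys.exit(-1)
    args.func(args)


if __name__ == '__main__':
    main()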
sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml) - sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio', + sub_persist_pdftrio = subparsers.add_parser( + 'persist-pdftrio', help="daemon that consumes pdftrio output from Kafka and pushes to postgres") sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio) - sub_ingest_file = subparsers.add_parser('ingest-file', + sub_ingest_file = subparsers.add_parser( + 'ingest-file', help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka") sub_ingest_file.add_argument('--bulk', - action='store_true', - help="consume from bulk kafka topic (eg, for ingest backfill)") - sub_ingest_file.add_argument('--priority', + action='store_true', + help="consume from bulk kafka topic (eg, for ingest backfill)") + sub_ingest_file.add_argument( + '--priority', action='store_true', help="consume from priority kafka topic (eg, for SPN requests)") sub_ingest_file.set_defaults(func=run_ingest_file) - sub_persist_ingest_file = subparsers.add_parser('persist-ingest-file', + sub_persist_ingest_file = subparsers.add_parser( + 'persist-ingest-file', help="daemon that consumes ingest-file output from Kafka and pushes to postgres") sub_persist_ingest_file.set_defaults(func=run_persist_ingest_file) @@ -377,5 +399,6 @@ def main(): args.func(args) + if __name__ == '__main__': main() diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index 69fe320..9cc9055 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ This script is intended to be used for backfill ingest of old crawls. It can also be used as a fast path for getting freshly crawled content into fatcat if @@ -36,37 +35,35 @@ def run(args): }, } if args.release_stage: - assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update') + assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', + 'update') request['release_stage'] = args.release_stage print("{}".format(json.dumps(request, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--link-source', - required=True, - help="link_source to include in request") - parser.add_argument('--extid-type', - required=True, - help="extid to encode identifier as") + required=True, + help="link_source to include in request") + parser.add_argument('--extid-type', required=True, help="extid to encode identifier as") parser.add_argument('--ingest-type', - default="pdf", - help="ingest type (pdf, html, xml, etc)") + default="pdf", + help="ingest type (pdf, html, xml, etc)") parser.add_argument('--ingest-request-source', - default="arabesque", - help="to include in request") - parser.add_argument('--release-stage', - default=None, - help="to include in request") + default="arabesque", + help="to include in request") + parser.add_argument('--release-stage', default=None, help="to include in request") parser.add_argument('json_file', - help="arabesque output file to use", - type=argparse.FileType('r')) + help="arabesque output file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py index 
86ca062..83c04e3 100755 --- a/python/scripts/archiveorg_fileset.py +++ b/python/scripts/archiveorg_fileset.py @@ -23,11 +23,9 @@ FORMAT_TO_MIMETYPE = { 'RAR': 'application/vnd.rar', 'TAR': 'application/x-tar', '7z': 'application/x-7z-compressed', - 'HTML': 'text/html', 'Text': 'text/plain', 'PDF': 'application/pdf', - 'CSV': 'text/csv', 'XML': 'application/xml', 'JSON': 'application/json', @@ -36,20 +34,17 @@ FORMAT_TO_MIMETYPE = { #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx #'application/vnd.ms-excel', # .xls #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx - - 'MP3': 'audio/mpeg', # .mp3 - - 'MP4': 'video/mp4', # .mp4 - 'MPEG': 'video/mpeg', # .mpeg - + 'MP3': 'audio/mpeg', # .mp3 + 'MP4': 'video/mp4', # .mp4 + 'MPEG': 'video/mpeg', # .mpeg 'JPEG': 'image/jpeg', 'GIF': 'image/gif', 'PNG': 'image/png', 'TIFF': 'image/tiff', - 'Unknown': None, } + def want_file(f: dict, item_name: str) -> bool: """ Filters IA API files @@ -57,12 +52,12 @@ def want_file(f: dict, item_name: str) -> bool: if f.source != 'original': return False for suffix in [ - '_meta.sqlite', - '_archive.torrent', - '_itemimage.jpg', - '_meta.xml', - '_thumb.png', - '_files.xml', + '_meta.sqlite', + '_archive.torrent', + '_itemimage.jpg', + '_meta.xml', + '_thumb.png', + '_files.xml', ]: if f.name == item_name + suffix or f.name == item_name.lower() + suffix: return False @@ -74,6 +69,7 @@ def want_file(f: dict, item_name: str) -> bool: return False return True + def parse_file(f: dict) -> dict: """ Takes an IA API file and turns it in to a fatcat fileset manifest file @@ -93,6 +89,7 @@ def parse_file(f: dict) -> dict: mf['extra'] = dict(mimetype=mimetype) return mf + def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession): print(f"processing item={item_name} release_id={release_id}", file=sys.stderr) if release_id.startswith('release_'): @@ -104,18 +101,17 @@ def item_to_fileset(item_name: str, release_id: str, session: internetarchive.Ar manifest = [parse_file(f) for f in item_files if want_file(f, item_name)] fileset = { 'manifest': manifest, - 'urls': [ - { - 'rel': 'archive', - 'url': f'https://archive.org/download/{item_name}/', - }, - ], + 'urls': [{ + 'rel': 'archive', + 'url': f'https://archive.org/download/{item_name}/', + }, ], 'release_ids': [release_id], #extra={}, } print(json.dumps(fileset)) return fileset + def main(): session = internetarchive.get_session() if len(sys.argv) == 3: @@ -133,5 +129,6 @@ def main(): release_id = fields[1] item_to_fileset(item_name, release_id=release_id, session=session) + if __name__ == '__main__': main() diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py index 5e33def..aa78aec 100755 --- a/python/scripts/cdx_collection.py +++ b/python/scripts/cdx_collection.py @@ -35,9 +35,7 @@ def run(): print("Looking up collection: {}".format(collection)) # First fetch list - item_list = list( - ia.search_items( - query="collection:{} mediatype:web".format(collection))) + item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection))) if len(item_list) == 0: print("No items found, bailing") @@ -50,11 +48,12 @@ def run(): item = item['identifier'] # TODO: error handling try: - ret = ia.download(item, files=[item + '.cdx.gz'], - verbose=True, - destdir=tempdir, - no_directory=True, - retries=1000) + ret = ia.download(item, + files=[item + '.cdx.gz'], + verbose=True, + destdir=tempdir, + no_directory=True, + retries=1000) status = 
ret and status except requests.exceptions.ReadTimeout as rt: print(str(rt), file=sys.stderr) @@ -69,14 +68,13 @@ def run(): # Combine files print("Merging and re-compressing all CDX files...") #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir), - subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), - shell=True) + subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), shell=True) # Move and cleanup - shutil.move('{}/combined.gz'.format(tempdir), - '{}.cdx.gz'.format(collection)) + shutil.move('{}/combined.gz'.format(tempdir), '{}.cdx.gz'.format(collection)) print("Done!") -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 1b7c85c..4714b60 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an unpaywall dump (JSON) into ingest requests. """ @@ -21,7 +20,6 @@ def transform_cnki(obj): requests = [] assert obj['cnki_id'] - requests = [] requests.append({ 'base_url': canon(obj['info_url']), @@ -41,6 +39,7 @@ def transform_cnki(obj): return requests + def transform_wanfang(obj): assert obj['wanfang_id'] @@ -68,17 +67,18 @@ def run(args): for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="COVID-19 metadata file to use", - type=argparse.FileType('r')) + help="COVID-19 metadata file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 62a85e6..3b53235 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -49,7 +49,6 @@ def b32_hex(s): class DeliverDumpGrobidS3(): - def __init__(self, s3_bucket, **kwargs): self.rstore = None self.count = Counter() @@ -80,11 +79,7 @@ class DeliverDumpGrobidS3(): tei_xml = tei_xml.encode('utf-8') # upload to AWS S3 obj = self.bucket.put_object( - Key="{}{}/{}{}".format( - self.s3_prefix, - sha1_hex[0:4], - sha1_hex, - self.s3_suffix), + Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix), Body=tei_xml, StorageClass=self.s3_storage_class, ) @@ -92,6 +87,7 @@ class DeliverDumpGrobidS3(): self.count['success-s3'] += 1 sys.stderr.write("{}\n".format(self.count)) + @sentry_client.capture_exceptions def main(): @@ -121,5 +117,6 @@ def main(): worker = DeliverDumpGrobidS3(**args.__dict__) worker.run(args.dump_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == '__main__': # pragma: no cover main() diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index ab1906a..ca19b97 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -26,7 +26,6 @@ sentry_client = raven.Client() class DeliverGwbDisk: - def __init__(self, disk_dir, **kwargs): self.warc_uri_prefix = kwargs.get('warc_uri_prefix') self.rstore = None @@ -34,7 +33,8 @@ class DeliverGwbDisk: # /serve/ instead of /download/ doesn't record view count self.petabox_base_url = kwargs.get('petabox_base_url', 
'http://archive.org/serve/') # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) + self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', + os.environ.get('PETABOX_WEBDATA_SECRET')) self.disk_dir = disk_dir self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') self.disk_suffix = kwargs.get('disk_suffix', '.pdf') @@ -42,48 +42,56 @@ class DeliverGwbDisk: def fetch_warc_content(self, warc_path, offset, c_size): warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: - self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( - webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + self.rstore = ResourceStore( + loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url)) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)" + ) except ValueError as ve: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})". + format(ve)) except EOFError as eofe: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})". + format(eofe)) except TypeError as te: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) + return None, dict( + status="error", + reason= + "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" + .format(te)) # Note: could consider a generic "except Exception" here, as we get so # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. if gwb_record.get_status()[0] != 200: return None, dict(status="error", - reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) + reason="archived HTTP response (WARC) was not 200", + warc_status=gwb_record.get_status()[0]) try: raw_content = gwb_record.open_raw_content().read() except IncompleteRead as ire: - return None, dict(status="error", - reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + return None, dict( + status="error", + reason= + "failed to read actual file contents from wayback/petabox (IncompleteRead: {})". 
+ format(ire)) return raw_content, None def run(self, manifest_file): sys.stderr.write("Ensuring all 65536 base directories exist...\n") for i in range(256): for j in range(256): - fpath = "{}/{}{:02x}/{:02x}".format( - self.disk_dir, - self.disk_prefix, - i, - j) + fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j) os.makedirs(fpath, exist_ok=True) sys.stderr.write("Starting...\n") for line in manifest_file: @@ -102,9 +110,11 @@ class DeliverGwbDisk: self.count['skip-warc'] += 1 continue # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) + blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], + file_cdx['c_size']) if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) + print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], + status['reason'])) self.count['err-petabox-fetch'] += 1 continue elif not blob: @@ -120,19 +130,15 @@ class DeliverGwbDisk: self.count['petabox-ok'] += 1 # save to disk - fpath = "{}/{}{}/{}/{}{}".format( - self.disk_dir, - self.disk_prefix, - sha1_hex[0:2], - sha1_hex[2:4], - sha1_hex, - self.disk_suffix) + fpath = "{}/{}{}/{}/{}{}".format(self.disk_dir, self.disk_prefix, sha1_hex[0:2], + sha1_hex[2:4], sha1_hex, self.disk_suffix) with open(fpath, 'wb') as f: f.write(blob) print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob))) self.count['success-disk'] += 1 sys.stderr.write("{}\n".format(self.count)) + @sentry_client.capture_exceptions def main(): @@ -162,5 +168,6 @@ def main(): worker = DeliverGwbDisk(**args.__dict__) worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == '__main__': # pragma: no cover main() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index f103205..f9b3b19 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -53,7 +53,6 @@ sentry_client = raven.Client() class DeliverGwbS3: - def __init__(self, s3_bucket, **kwargs): self.warc_uri_prefix = kwargs.get('warc_uri_prefix') self.rstore = None @@ -61,7 +60,8 @@ class DeliverGwbS3: # /serve/ instead of /download/ doesn't record view count self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) + self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', + os.environ.get('PETABOX_WEBDATA_SECRET')) self.s3_bucket = s3_bucket self.s3_prefix = kwargs.get('s3_prefix', 'pdf/') self.s3_suffix = kwargs.get('s3_suffix', '.pdf') @@ -71,37 +71,49 @@ class DeliverGwbS3: def fetch_warc_content(self, warc_path, offset, c_size): warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: - self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( - webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + self.rstore = ResourceStore( + loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url)) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + return None, dict( + 
status="error", + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)" + ) except ValueError as ve: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})". + format(ve)) except EOFError as eofe: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})". + format(eofe)) except TypeError as te: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) + return None, dict( + status="error", + reason= + "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" + .format(te)) # Note: could consider a generic "except Exception" here, as we get so # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. if gwb_record.get_status()[0] != 200: return None, dict(status="error", - reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) + reason="archived HTTP response (WARC) was not 200", + warc_status=gwb_record.get_status()[0]) try: raw_content = gwb_record.open_raw_content().read() except IncompleteRead as ire: - return None, dict(status="error", - reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + return None, dict( + status="error", + reason= + "failed to read actual file contents from wayback/petabox (IncompleteRead: {})". + format(ire)) return raw_content, None def run(self, manifest_file): @@ -122,9 +134,11 @@ class DeliverGwbS3: self.count['skip-warc'] += 1 continue # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) + blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], + file_cdx['c_size']) if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) + print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], + status['reason'])) self.count['err-petabox-fetch'] += 1 continue elif not blob: @@ -140,17 +154,14 @@ class DeliverGwbS3: self.count['petabox-ok'] += 1 # upload to AWS S3 - obj = self.bucket.put_object( - Key="{}{}/{}{}".format( - self.s3_prefix, - sha1_hex[0:4], - sha1_hex, - self.s3_suffix), - Body=blob) + obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], + sha1_hex, self.s3_suffix), + Body=blob) print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob))) self.count['success-s3'] += 1 sys.stderr.write("{}\n".format(self.count)) + @sentry_client.capture_exceptions def main(): @@ -180,5 +191,6 @@ def main(): worker = DeliverGwbS3(**args.__dict__) worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == '__main__': # pragma: no cover main() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py index 15b30a0..84a2c2c 100755 --- a/python/scripts/doaj2ingestrequest.py +++ b/python/scripts/doaj2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an DOAJ article dump (JSON) into ingest requests. 
@@ -42,22 +41,22 @@ CONTENT_TYPE_MAP = { "abstract": [], "doc": [], "": ["pdf"], - "doi": ["pdf"], "url": ["pdf"], "fulltext": ["pdf"], "anySimpleType": ["pdf"], - "application/pdf": ["pdf"], "html": ["html", "pdf"], "text/html": ["html", "pdf"], "xml": ["xml"], } + def canon(s: str) -> str: parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj: dict) -> List[dict]: """ Transforms from a single DOAJ object to zero or more ingest requests. @@ -118,6 +117,7 @@ def transform(obj: dict) -> List[dict]: return requests + def run(args) -> None: for l in args.json_file: if not l.strip(): @@ -128,17 +128,18 @@ def run(args) -> None: for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main() -> None: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="DOAJ article dump file to use", - type=argparse.FileType('r')) + help="DOAJ article dump file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py index 3085346..54c3d5f 100755 --- a/python/scripts/enrich_scored_matches.py +++ b/python/scripts/enrich_scored_matches.py @@ -34,13 +34,13 @@ def run(): sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower() - obj = dict( - sha1=sha1, - dois=dois, - cdx=[dict(url=cdx['url'], dt=cdx['dt'])], - size=size, - mimetype=mimetype) + obj = dict(sha1=sha1, + dois=dois, + cdx=[dict(url=cdx['url'], dt=cdx['dt'])], + size=size, + mimetype=mimetype) print(json.dumps(obj)) -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index d0666ce..a474393 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -24,6 +24,7 @@ NAME_DENYLIST = ( 'phdstudent', ) + def tokenize(s, remove_whitespace=True): s.replace(''', "'") @@ -36,9 +37,11 @@ def tokenize(s, remove_whitespace=True): # Encode as dumb ASCII (TODO: this is horrible) return s.encode('ascii', 'replace').decode('utf8').replace('?', '') + assert tokenize("Impact Factor: 2.114") == "impactfactor" assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST + def filter_title(title): title = title.strip() @@ -83,19 +86,23 @@ def filter_title(title): return title + def filter_author_name(name): name = name['name'] if name.strip().lower().replace(' ', '') in NAME_DENYLIST: return None return ' '.join([t for t in name.split() if tokenize(t)]) + def filter_authors(l): return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1] + def filter_refs(l): # TODO: return l + def filter_journal_name(name): # same denylist, for now if not name: @@ -104,10 +111,12 @@ def filter_journal_name(name): slug_name = tokenize(name) if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º": return None - for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "): + for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", + "Research Article ", "Available online www.jocpr.com "): if name.startswith(prefix): name = name.replace(prefix, '') - for suffix in (" Available 
online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"): + for suffix in (" Available online at www.sciarena.com", " Original Article", + " Available online at", " ISSN", " ISSUE"): if name.endswith(suffix): name = name.replace(suffix, '') if "====================" in name: @@ -116,6 +125,7 @@ def filter_journal_name(name): return None return ' '.join(name.split()) + def filter_metadata(obj): if not (obj.get('title') and obj.get('authors')): return None @@ -132,6 +142,7 @@ def filter_metadata(obj): return obj + def run(invert=False): for line in sys.stdin: fields = line.split('\t') @@ -155,5 +166,6 @@ def run(invert=False): elif invert: print(raw.strip()) -if __name__=="__main__": + +if __name__ == "__main__": run(invert="--invert" in sys.argv) diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index 494da71..fda9098 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -28,6 +28,7 @@ MAX_SLUG_LINES = 50 REQUIRE_AUTHORS = False + def tokenize(s, remove_whitespace=False): s.replace(''', "'") @@ -40,6 +41,7 @@ def tokenize(s, remove_whitespace=False): # Encode as dumb ASCII (TODO: this is horrible) return s.encode('ascii', 'replace').replace(b'?', b'') + def check_authors(left, right): """ Intended to check GROBID extracted authors (right) against "known good" @@ -63,6 +65,7 @@ def check_authors(left, right): return False return True + def test_check_authors(): assert check_authors([], []) == bool(not REQUIRE_AUTHORS) assert not check_authors([], ['one']) @@ -74,6 +77,7 @@ def test_check_authors(): assert check_authors(['Mr. Magoo'], ['Mr Magoo']) assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + # Rows are (score, left, right) def process_group(rows): @@ -119,6 +123,7 @@ def process_group(rows): print(json.dumps([releases[ident] for ident in group_ids])) + def run(): last_slug = None @@ -140,5 +145,6 @@ def run(): if lines: process_group(lines) -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index abf81bd..3251852 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -33,6 +33,7 @@ def tokenize(s, remove_whitespace=False): # Encode as dumb ASCII (TODO: this is horrible) return s.encode('ascii', 'replace').replace(b'?', b'') + def check_authors(left, right): """ Intended to check GROBID extracted authors (right) against "known good" @@ -56,6 +57,7 @@ def check_authors(left, right): return False return True + def test_check_authors(): assert not check_authors([], []) assert not check_authors([], ['one']) @@ -67,6 +69,7 @@ def test_check_authors(): assert check_authors(['Mr. 
Magoo'], ['Mr Magoo']) assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + # Rows are (score, grobid, crossref) def process_group(rows): if len(rows) > max_slug_lines: @@ -92,6 +95,7 @@ def process_group(rows): for sha1, doi_list in keepers.items(): print("{}\t{}".format(sha1, json.dumps(doi_list))) + def run(): last_slug = None @@ -112,5 +116,6 @@ def run(): if lines: process_group(lines) -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index d391f60..b42153c 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction output, converts the XML to JSON, filters out raw affiliation strings, and @@ -24,10 +23,12 @@ def parse_hbase(line): tei_xml = obj['tei_xml'] return sha1hex, tei_xml + def parse_pg(line): obj = json.loads(line) return obj['sha1hex'], obj['tei_xml'] + def run(mode='hbase'): for line in sys.stdin: if mode == 'hbase': @@ -49,5 +50,6 @@ def run(mode='hbase'): affiliations = [json.loads(a) for a in affiliations] print('\t'.join([sha1hex, json.dumps(affiliations)])) -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py index 8aee0be..c9bc134 100755 --- a/python/scripts/import_grobid_metadata.py +++ b/python/scripts/import_grobid_metadata.py @@ -4,7 +4,8 @@ import datetime import json import sys -MAX_ABSTRACT_BYTES=4096 +MAX_ABSTRACT_BYTES = 4096 + def parse_grobid_json(obj): @@ -14,10 +15,7 @@ def parse_grobid_json(obj): extra = dict() if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: - abobj = dict( - mimetype="text/plain", - language=None, - content=obj.get('abstract').strip()) + abobj = dict(mimetype="text/plain", language=None, content=obj.get('abstract').strip()) abstracts = [abobj] else: abstracts = None @@ -72,16 +70,16 @@ def parse_grobid_json(obj): else: extra = None - return dict( - title=obj['title'].strip(), - contribs=contribs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), - abstracts=abstracts, - release_type=release_type, - release_date=release_date, - extra=extra) + return dict(title=obj['title'].strip(), + contribs=contribs, + publisher=obj['journal'].get('publisher'), + volume=obj['journal'].get('volume'), + issue=obj['journal'].get('issue'), + abstracts=abstracts, + release_type=release_type, + release_date=release_date, + extra=extra) + def run(): for line in sys.stdin: @@ -90,5 +88,6 @@ def run(): if out: print(out) -if __name__=="__main__": + +if __name__ == "__main__": run() diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index acba2a8..70731d5 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ This script is used to turn ingest request postgres rows (in JSON export format) back in to regular ingest request JSON. 
@@ -25,6 +24,7 @@ def transform(row): row['fatcat'] = dict(release_ident=extra['release_ident']) return row + def run(args): for l in args.json_file: if not l.strip(): @@ -35,17 +35,18 @@ def run(args): print(l, file=sys.stderr) print(json.dumps(req, sort_keys=True)) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="arabesque output file to use", - type=argparse.FileType('r')) + help="arabesque output file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py index 8267003..24e22fd 100755 --- a/python/scripts/manifest_converter.py +++ b/python/scripts/manifest_converter.py @@ -20,6 +20,7 @@ import sys # 2. select all file metadata # 3. output object + def or_none(s): if s is None: return None @@ -27,6 +28,7 @@ def or_none(s): return None return s + def process_db(db_path): db = sqlite3.connect(db_path) @@ -52,5 +54,6 @@ def process_db(db_path): dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]) print(json.dumps(obj)) -if __name__=="__main__": + +if __name__ == "__main__": process_db(sys.argv[1]) diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py index 315b8d2..1f4a19f 100755 --- a/python/scripts/oai2ingestrequest.py +++ b/python/scripts/oai2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an OAI-PMH bulk dump (JSON) into ingest requests. @@ -33,17 +32,19 @@ DOMAIN_BLOCKLIST = [ ] RELEASE_STAGE_MAP = { - 'info:eu-repo/semantics/draftVersion': 'draft', + 'info:eu-repo/semantics/draftVersion': 'draft', 'info:eu-repo/semantics/submittedVersion': 'submitted', - 'info:eu-repo/semantics/acceptedVersion': 'accepted', + 'info:eu-repo/semantics/acceptedVersion': 'accepted', 'info:eu-repo/semantics/publishedVersion': 'published', - 'info:eu-repo/semantics/updatedVersion': 'updated', + 'info:eu-repo/semantics/updatedVersion': 'updated', } + def canon(s): parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj): """ Transforms from a single OAI-PMH object to zero or more ingest requests. @@ -112,6 +113,7 @@ def transform(obj): return requests + def run(args): for l in args.json_file: if not l.strip(): @@ -122,17 +124,18 @@ def run(args): for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="OAI-PMH dump file to use (usually stdin)", - type=argparse.FileType('r')) + help="OAI-PMH dump file to use (usually stdin)", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py index 71fbe54..3f81b3b 100755 --- a/python/scripts/pdf_thumbnail.py +++ b/python/scripts/pdf_thumbnail.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc). 
@@ -23,12 +22,14 @@ def run(inpath, outpath): renderer = poppler.PageRenderer() full_page = renderer.render_page(page) - img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1) - img.thumbnail((180,300), Image.BICUBIC) + img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', + "BGRA", 0, 1) + img.thumbnail((180, 300), Image.BICUBIC) #img.thumbnail((360,600), Image.BICUBIC) img.save(outpath) #img.save(outpath, quality=95) + if __name__ == '__main__': if len(sys.argv) != 3: print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr) diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index 590b429..b79f316 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an unpaywall dump (JSON) into ingest requests. """ @@ -26,17 +25,19 @@ DOMAIN_BLOCKLIST = [ ] RELEASE_STAGE_MAP = { - 'draftVersion': 'draft', + 'draftVersion': 'draft', 'submittedVersion': 'submitted', - 'acceptedVersion': 'accepted', + 'acceptedVersion': 'accepted', 'publishedVersion': 'published', - 'updatedVersion': 'updated', + 'updatedVersion': 'updated', } + def canon(s): parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj): """ Transforms from a single unpaywall object to zero or more ingest requests. @@ -86,6 +87,7 @@ def transform(obj): return requests + def run(args): for l in args.json_file: if not l.strip(): @@ -96,17 +98,18 @@ def run(args): for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="unpaywall dump file to use", - type=argparse.FileType('r')) + help="unpaywall dump file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py index 7d950df..55636dc 100644 --- a/python/tests/test_grobid.py +++ b/python/tests/test_grobid.py @@ -1,4 +1,3 @@ - import struct import pytest @@ -12,20 +11,21 @@ FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f: REAL_TEI_XML = f.read() + @pytest.fixture def grobid_client(): - client = GrobidClient( - host_url="http://dummy-grobid", - ) + client = GrobidClient(host_url="http://dummy-grobid", ) return client + @responses.activate def test_grobid_503(grobid_client): status = b'{"status": "done broke due to 503"}' responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=503, - body=status) + 'http://dummy-grobid/api/processFulltextDocument', + status=503, + body=status) resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) @@ -35,12 +35,15 @@ def test_grobid_503(grobid_client): assert resp['status_code'] == 503 assert resp['status'] == "error" + @responses.activate def test_grobid_success(grobid_client): responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=200, - body=REAL_TEI_XML, content_type='text/xml') + 'http://dummy-grobid/api/processFulltextDocument', + status=200, + body=REAL_TEI_XML, + content_type='text/xml') resp = 
grobid_client.process_fulltext(FAKE_PDF_BYTES) @@ -53,6 +56,7 @@ def test_grobid_success(grobid_client): #print(type(REAL_TEI_XML)) assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1') + @responses.activate def test_grobid_worker_cdx(grobid_client, wayback_client): @@ -60,8 +64,10 @@ def test_grobid_worker_cdx(grobid_client, wayback_client): worker = GrobidWorker(grobid_client, wayback_client, sink=sink) responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=200, - body=REAL_TEI_XML, content_type='text/xml') + 'http://dummy-grobid/api/processFulltextDocument', + status=200, + body=REAL_TEI_XML, + content_type='text/xml') with open('tests/files/example.cdx', 'r') as cdx_file: pusher = CdxLinePusher( @@ -76,4 +82,3 @@ def test_grobid_worker_cdx(grobid_client, wayback_client): assert pusher_counts['pushed'] == worker.counts['total'] assert len(responses.calls) == worker.counts['total'] - diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py index b8999b1..7637871 100644 --- a/python/tests/test_grobid2json.py +++ b/python/tests/test_grobid2json.py @@ -1,4 +1,3 @@ - import json import xml @@ -8,14 +7,15 @@ from grobid2json import * def test_small_xml(): - + with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() with open('tests/files/small.json', 'r') as f: - json_form = json.loads(f.read()) + json_form = json.loads(f.read()) assert teixml2json(tei_xml) == json_form + def test_invalid_xml(): with pytest.raises(xml.etree.ElementTree.ParseError): diff --git a/python/tests/test_html.py b/python/tests/test_html.py index d4bffc1..c5f422e 100644 --- a/python/tests/test_html.py +++ b/python/tests/test_html.py @@ -1,4 +1,3 @@ - import json import pytest @@ -13,8 +12,7 @@ def test_extract_fulltext_url(): assert resp == {} resp = extract_fulltext_url( - "http://dummy-site/", - b"""<html> + "http://dummy-site/", b"""<html> <head> <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf"> </head> @@ -22,8 +20,7 @@ def test_extract_fulltext_url(): <h1>my big article here</h1> blah </body> - </html>""" - ) + </html>""") assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf" assert resp['technique'] == "citation_pdf_url" @@ -32,4 +29,5 @@ def test_extract_fulltext_url(): "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978", f.read(), ) - assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" + assert resp[ + 'pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py index 943e5da..3bf94e2 100644 --- a/python/tests/test_html_ingest.py +++ b/python/tests/test_html_ingest.py @@ -1,4 +1,3 @@ - import datetime import pytest diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py index 7f35d55..a4c1e41 100644 --- a/python/tests/test_html_metadata.py +++ b/python/tests/test_html_metadata.py @@ -1,4 +1,3 @@ - import datetime import pytest @@ -44,11 +43,12 @@ def test_html_metadata_plos() -> None: def test_html_metadata_elife() -> None: - + with open('tests/files/elife_article.html', 'r') as f: elife_html = f.read() - meta = html_extract_biblio("https://elifesciences.org/articles/44753", HTMLParser(elife_html)) + meta = html_extract_biblio("https://elifesciences.org/articles/44753", + HTMLParser(elife_html)) assert meta is not None 
assert meta.title == "Parallel visual circuitry in a basal chordate" assert meta.doi == "10.7554/eLife.44753" @@ -69,7 +69,7 @@ def test_html_metadata_elife() -> None: def test_html_metadata_peerj() -> None: - + with open('tests/files/peerj_oa_article.html', 'r') as f: peerj_html = f.read() @@ -78,15 +78,15 @@ def test_html_metadata_peerj() -> None: assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles" assert meta.doi == "10.7717/peerj.4375" assert meta.contrib_names == [ - "Heather Piwowar", - "Jason Priem", - "Vincent Larivière", - "Juan Pablo Alperin", - "Lisa Matthias", - "Bree Norlander", - "Ashley Farley", - "Jevin West", - "Stefanie Haustein", + "Heather Piwowar", + "Jason Priem", + "Vincent Larivière", + "Juan Pablo Alperin", + "Lisa Matthias", + "Bree Norlander", + "Ashley Farley", + "Jevin West", + "Stefanie Haustein", ] assert meta.container_name == "PeerJ" # "2018-02-13" @@ -129,7 +129,7 @@ def test_html_metadata_ojs3() -> None: "Os Keyes", ] assert meta.container_name == "First Monday" - assert meta.container_abbrev == "1" # NOTE: bad source metadata + assert meta.container_abbrev == "1" # NOTE: bad source metadata assert meta.container_issn == "1396-0466" # "2020/09/10" assert meta.release_date == datetime.date(year=2020, month=9, day=10) @@ -150,6 +150,7 @@ def test_html_metadata_dlib() -> None: # "2017-05-15" assert meta.release_date == datetime.date(year=2017, month=5, day=15) + def test_html_metadata_dc_case() -> None: """ This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive. @@ -167,10 +168,12 @@ def test_html_metadata_dc_case() -> None: assert meta is not None assert meta.issue == "123" + @pytest.fixture def adblock() -> Any: return load_adblock_rules() + def test_html_resources(adblock) -> None: with open('tests/files/dlib_05vanhyning.html', 'r') as f: @@ -227,4 +230,3 @@ def test_html_resources(adblock) -> None: HTMLParser(nature_html), adblock, ) - diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 0965fcb..79f50f4 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -1,4 +1,3 @@ - import json import pytest @@ -12,9 +11,7 @@ from sandcrawler import * @pytest.fixture def ingest_worker(wayback_client, spn_client): - grobid_client = GrobidClient( - host_url="http://dummy-grobid", - ) + grobid_client = GrobidClient(host_url="http://dummy-grobid", ) worker = IngestFileWorker( wayback_client=wayback_client, spn_client=spn_client, @@ -22,14 +19,11 @@ def ingest_worker(wayback_client, spn_client): ) return worker + @pytest.fixture def ingest_worker_pdf(wayback_client_pdf, spn_client): - grobid_client = GrobidClient( - host_url="http://dummy-grobid", - ) - pgrest_client = SandcrawlerPostgrestClient( - api_url="http://dummy-postgrest", - ) + grobid_client = GrobidClient(host_url="http://dummy-grobid", ) + pgrest_client = SandcrawlerPostgrestClient(api_url="http://dummy-postgrest", ) worker = IngestFileWorker( wayback_client=wayback_client_pdf, spn_client=spn_client, @@ -50,37 +44,45 @@ def test_ingest_success(ingest_worker_pdf): 'base_url': "http://dummy-host/", } responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({ + "url": TARGET, + "job_id": JOB_ID + })) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) + 
'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SPN_HIT)) responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=pdf_bytes) + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", + TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=pdf_bytes) responses.add(responses.GET, - 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"), - status=200, - body=json.dumps([])) + 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format( + "90ffd2359008d82298821d16b21778c5c39aec36"), + status=200, + body=json.dumps([])) responses.add(responses.GET, - 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"), - status=200, - body=json.dumps([])) + 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format( + "90ffd2359008d82298821d16b21778c5c39aec36"), + status=200, + body=json.dumps([])) responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=200, - body=REAL_TEI_XML, content_type='text/xml') + 'http://dummy-grobid/api/processFulltextDocument', + status=200, + body=REAL_TEI_XML, + content_type='text/xml') resp = ingest_worker_pdf.process(request) @@ -108,6 +110,7 @@ def test_ingest_success(ingest_worker_pdf): assert resp['pdf_meta']['pdf_extra']['page_count'] == 1 assert resp['pdf_meta'].get('text') is None + @responses.activate def test_ingest_landing(ingest_worker): @@ -116,34 +119,39 @@ def test_ingest_landing(ingest_worker): 'base_url': "http://dummy-host/", } responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({ + "url": TARGET, + "job_id": JOB_ID + })) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SPN_HIT)) responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=WARC_BODY) + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", + TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=WARC_BODY) # this is for second time around; don't want to fetch same landing page # HTML again and result in a loop responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), - 
status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body="<html></html>") + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", + TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body="<html></html>") resp = ingest_worker.process(request) @@ -157,6 +165,7 @@ def test_ingest_landing(ingest_worker): assert 'revisit_cdx' not in resp assert 'grobid' not in resp + @responses.activate def test_ingest_blocklist(ingest_worker): @@ -192,6 +201,7 @@ def test_ingest_wall_blocklist(ingest_worker): assert resp['status'] == "skip-wall" assert resp['request'] == request + @responses.activate def test_ingest_cookie_blocklist(ingest_worker): @@ -205,4 +215,3 @@ def test_ingest_cookie_blocklist(ingest_worker): assert resp['hit'] == False assert resp['status'] == "blocked-cookie" assert resp['request'] == request - diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index b501dc3..0ff4902 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -1,4 +1,3 @@ - """ This file contains tests to run against "live" wayback services. They default to "skip" because you need authentication, and we shouldn't hit these services @@ -11,8 +10,8 @@ import json import pytest -from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, SavePageNowError, - WaybackClient, WaybackError, gen_file_metadata) +from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, + SavePageNowError, WaybackClient, WaybackError, gen_file_metadata) @pytest.fixture @@ -20,16 +19,19 @@ def cdx_client(): client = CdxApiClient() return client + @pytest.fixture def wayback_client(): client = WaybackClient() return client + @pytest.fixture def spn_client(): client = SavePageNowClient() return client + @pytest.mark.skip(reason="hits prod services, requires auth") def test_cdx_fetch(cdx_client): @@ -50,6 +52,7 @@ def test_cdx_fetch(cdx_client): with pytest.raises(KeyError): resp = cdx_client.fetch(url, "12345678123456") + @pytest.mark.skip(reason="hits prod services, requires auth") def test_cdx_lookup_best(cdx_client): @@ -68,13 +71,18 @@ def test_cdx_lookup_best(cdx_client): assert resp.mimetype == "text/html" assert resp.status_code == 200 + @pytest.mark.skip(reason="hits prod services, requires auth") def test_wayback_fetch(wayback_client): - resp = wayback_client.fetch_petabox(25683, 2676464871, "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz") + resp = wayback_client.fetch_petabox( + 25683, 2676464871, + "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz" + ) assert resp.body + @pytest.mark.skip(reason="hits prod services, requires auth") def test_lookup_resource_success(wayback_client): @@ -86,6 +94,7 @@ def test_lookup_resource_success(wayback_client): assert resp.terminal_url in (url, url.replace("https://", "http://")) assert resp.cdx.url in (url, url.replace("https://", "http://")) + @pytest.mark.skip(reason="hits prod services, requires auth") def test_cdx_fetch_spn2(cdx_client): @@ -107,8 +116,8 @@ def test_cdx_fetch_spn2(cdx_client): # https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410 #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 
liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz -#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz -#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz + #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz + #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209" datetime = "20200110222410" @@ -119,6 +128,7 @@ def test_cdx_fetch_spn2(cdx_client): assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL" assert resp.status_code == 200 + @pytest.mark.skip(reason="hits prod services, requires auth") def test_lookup_ftp(wayback_client): # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf @@ -153,6 +163,7 @@ def test_lookup_ftp(wayback_client): file_meta = gen_file_metadata(resp.body) assert file_meta['sha1hex'] == resp.cdx.sha1hex + @pytest.mark.skip(reason="hits prod services, requires auth") def test_crawl_ftp(spn_client, wayback_client): diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index 0788c38..dcc1202 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -1,11 +1,11 @@ - import pytest -from sandcrawler import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_line +from sandcrawler import (b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, + parse_cdx_line) def test_gen_file_metadata(): - + # valid (but very small) PDF file with open('tests/files/dummy.pdf', 'rb') as f: file_meta = gen_file_metadata(f.read()) @@ -27,8 +27,9 @@ def test_gen_file_metadata(): assert fm['mimetype'] == 'text/plain' assert fm['size_bytes'] == 8 + def test_gen_file_metadata_path(): - + # valid (but very small) PDF file file_meta = gen_file_metadata_path('tests/files/dummy.pdf') assert file_meta == { @@ -39,11 +40,14 @@ def test_gen_file_metadata_path(): 'size_bytes': 13264, } + def test_b32_hex(): # valid b32 - assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' - assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' + assert b32_hex( + 'sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' + assert b32_hex( + 'TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' # sha1hex pass-through s = 'bda3c1017d52e826bbd1da51efad877272d300f9' @@ -53,6 +57,7 @@ def test_b32_hex(): with pytest.raises(ValueError): assert b32_hex('blah') == 'blah' + def test_parse_cdx_line(): raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 
application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" @@ -73,6 +78,7 @@ def test_parse_cdx_line(): assert parse_cdx_line(raw + "\n") == correct assert parse_cdx_line(raw + " extra_field") == correct + def test_invalid_cdx(): print("missing warc") @@ -80,11 +86,11 @@ def test_invalid_cdx(): assert parse_cdx_line(raw) == None print("bad datetime") - raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" + raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" assert parse_cdx_line(raw) == None + def test_clean_url(): assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf" assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \ "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view" - diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py index 1d334d6..146b138 100644 --- a/python/tests/test_pdfextract.py +++ b/python/tests/test_pdfextract.py @@ -1,4 +1,3 @@ - import struct import poppler @@ -6,11 +5,13 @@ import pytest import responses from test_wayback import cdx_client, wayback_client -from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, WaybackClient +from sandcrawler import (BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, + WaybackClient) from sandcrawler.pdfextract import process_pdf FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) + def test_process_fake_pdf(): resp = process_pdf(FAKE_PDF_BYTES) print(resp) @@ -21,7 +22,9 @@ def test_process_fake_pdf(): resp = process_pdf(pdf_bytes) assert resp.status == 'not-pdf' -@pytest.mark.skipif(poppler.version_string() == '0.71.0', reason="unsupported version of poppler") + +@pytest.mark.skipif(poppler.version_string() == '0.71.0', + reason="unsupported version of poppler") def test_process_dummy_pdf(): with open('tests/files/dummy.pdf', 'rb') as f: pdf_bytes = f.read() @@ -39,6 +42,7 @@ def test_process_dummy_pdf(): assert resp.pdf_extra['page0_width'] == 595 assert resp.pdf_extra['page_count'] == 1 + def test_pdfextract_worker_cdx(wayback_client): sink = BlackholeSink() @@ -56,6 +60,7 @@ def test_pdfextract_worker_cdx(wayback_client): assert pusher_counts['pushed'] == 7 assert pusher_counts['pushed'] == worker.counts['total'] + def test_pdfextract_blob_worker(): sink = BlackholeSink() @@ -65,4 +70,3 @@ def test_pdfextract_blob_worker(): pdf_bytes = f.read() worker.process(pdf_bytes) - diff --git a/python/tests/test_pushers.py 
b/python/tests/test_pushers.py
index 62fa515..63f90d3 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,4 +1,3 @@
-
 import pytest
 
 from sandcrawler.workers import BlackholeSink, CdxLinePusher
@@ -18,8 +17,10 @@ def test_cdx_line_pusher():
 
     # HTTP 200 and application/pdf
     with open('tests/files/example.cdx', 'r') as cdx_file:
-        pusher = CdxLinePusher(sink, cdx_file,
-            filter_mimetypes=['application/pdf'], filter_http_statuses=[200, 226])
+        pusher = CdxLinePusher(sink,
+                               cdx_file,
+                               filter_mimetypes=['application/pdf'],
+                               filter_http_statuses=[200, 226])
         counts = pusher.run()
         assert counts['total'] == 20
         assert counts['skip-parse'] == 1
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index f3fbfda..80334d9 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -1,4 +1,3 @@
-
 import json
 
 import pytest
@@ -26,9 +25,7 @@ SUCCESS_BODY = {
     "timestamp": "20180326070330",
     "duration_sec": 6.203,
     "resources": [
-        TARGET,
-        TARGET + "/redirect",
-        "http://brewster.kahle.org/",
+        TARGET, TARGET + "/redirect", "http://brewster.kahle.org/",
         "http://brewster.kahle.org/favicon.ico",
         "http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg",
         "http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg",
@@ -43,8 +40,7 @@ SUCCESS_BODY = {
         "http://brewster.kahle.org/wp-content/themes/twentyten/style.css",
         "http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4",
         "http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
-        "http://platform.twitter.com/widgets.js",
-        "https://archive-it.org/piwik.js",
+        "http://platform.twitter.com/widgets.js", "https://archive-it.org/piwik.js",
         "https://platform.twitter.com/jot.html",
         "https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js",
         "https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html",
@@ -60,7 +56,7 @@ SUCCESS_BODY = {
         "https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4",
         "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4"
     ],
-    "outlinks":{
+    "outlinks": {
         "https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695",
         "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695"
     }
@@ -74,10 +70,18 @@ ERROR_BODY = {
     "resources": []
 }
 CDX_SPN_HIT = [
-    ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
-    ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"],
+    [
+        "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
+        "robotflags", "length", "offset", "filename"
+    ],
+    [
+        "wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200",
+        CDX_BEST_SHA1B32, "-", "-", "8445", "108062304",
+        "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"
+    ],
 ]
 
+
 @pytest.fixture
 def spn_client():
     client = SavePageNowClient(
@@ -88,25 +92,29 @@ def spn_client():
     client.poll_seconds = 0.0
     return client
 
+
 @responses.activate
 def test_savepagenow_success(spn_client):
 
     responses.add(responses.POST,
-        'http://dummy-spnv2/save',
-        status=200,
-        body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+                  'http://dummy-spnv2/save',
+                  status=200,
+                  body=json.dumps({
+                      "url": TARGET,
+                      "job_id": JOB_ID
+                  }))
     responses.add(responses.GET,
-        'http://dummy-spnv2/save/status/' + JOB_ID,
-        status=200,
-        body=json.dumps(PENDING_BODY))
+                  'http://dummy-spnv2/save/status/' + JOB_ID,
+                  status=200,
+                  body=json.dumps(PENDING_BODY))
     responses.add(responses.GET,
-        'http://dummy-spnv2/save/status/' + JOB_ID,
-        status=200,
-        body=json.dumps(PENDING_BODY))
+                  'http://dummy-spnv2/save/status/' + JOB_ID,
+                  status=200,
+                  body=json.dumps(PENDING_BODY))
     responses.add(responses.GET,
-        'http://dummy-spnv2/save/status/' + JOB_ID,
-        status=200,
-        body=json.dumps(SUCCESS_BODY))
+                  'http://dummy-spnv2/save/status/' + JOB_ID,
+                  status=200,
+                  body=json.dumps(SUCCESS_BODY))
 
     resp = spn_client.save_url_now_v2(TARGET)
 
@@ -119,21 +127,25 @@ def test_savepagenow_success(spn_client):
     assert resp.terminal_dt == SUCCESS_BODY['timestamp']
     assert resp.resources == SUCCESS_BODY['resources']
 
+
 @responses.activate
 def test_savepagenow_remote_error(spn_client):
 
     responses.add(responses.POST,
-        'http://dummy-spnv2/save',
-        status=200,
-        body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+                  'http://dummy-spnv2/save',
+                  status=200,
+                  body=json.dumps({
+                      "url": TARGET,
+                      "job_id": JOB_ID
+                  }))
     responses.add(responses.GET,
-        'http://dummy-spnv2/save/status/' + JOB_ID,
-        status=200,
-        body=json.dumps(PENDING_BODY))
+                  'http://dummy-spnv2/save/status/' + JOB_ID,
+                  status=200,
+                  body=json.dumps(PENDING_BODY))
     responses.add(responses.GET,
-        'http://dummy-spnv2/save/status/' + JOB_ID,
-        status=200,
-        body=json.dumps(ERROR_BODY))
+                  'http://dummy-spnv2/save/status/' + JOB_ID,
+                  status=200,
+                  body=json.dumps(ERROR_BODY))
 
     resp = spn_client.save_url_now_v2(TARGET)
 
@@ -146,47 +158,56 @@ def test_savepagenow_remote_error(spn_client):
     assert resp.terminal_dt == None
     assert resp.resources == None
 
+
 @responses.activate
 def test_savepagenow_500(spn_client):
 
     responses.add(responses.POST,
-        'http://dummy-spnv2/save',
-        status=200,
-        body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+                  'http://dummy-spnv2/save',
+                  status=200,
+                  body=json.dumps({
+                      "url": TARGET,
+                      "job_id": JOB_ID
+                  }))
     responses.add(responses.GET,
-        'http://dummy-spnv2/save/status/' + JOB_ID,
-        status=500,
-        body=json.dumps(ERROR_BODY))
+                  'http://dummy-spnv2/save/status/' + JOB_ID,
+                  status=500,
+                  body=json.dumps(ERROR_BODY))
 
     with pytest.raises(SavePageNowError):
         resp = spn_client.save_url_now_v2(TARGET)
 
     assert len(responses.calls) == 2
 
+
 @responses.activate
 def test_crawl_resource(spn_client, wayback_client):
 
     responses.add(responses.POST,
-        'http://dummy-spnv2/save',
-        status=200,
-        body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+                  'http://dummy-spnv2/save',
+                  status=200,
+                  body=json.dumps({
+                      "url": TARGET,
+                      "job_id": JOB_ID
+                  }))
    responses.add(responses.GET,
-        'http://dummy-spnv2/save/status/' + JOB_ID,
-        status=200,
-        body=json.dumps(PENDING_BODY))
+                  'http://dummy-spnv2/save/status/' + JOB_ID,
+                  status=200,
+                  body=json.dumps(PENDING_BODY))
     responses.add(responses.GET,
-        'http://dummy-spnv2/save/status/' + JOB_ID,
-        status=200,
-        body=json.dumps(SUCCESS_BODY))
+                  'http://dummy-spnv2/save/status/' + JOB_ID,
+                  status=200,
+                  body=json.dumps(SUCCESS_BODY))
     responses.add(responses.GET,
-        'http://dummy-cdx/cdx',
-        status=200,
-        body=json.dumps(CDX_SPN_HIT))
+                  'http://dummy-cdx/cdx',
+                  status=200,
+                  body=json.dumps(CDX_SPN_HIT))
     responses.add(responses.GET,
-        'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
-        status=200,
-        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
-        body=WARC_BODY)
+                  'https://web.archive.org/web/{}id_/{}'.format("20180326070330",
+                                                                TARGET + "/redirect"),
+                  status=200,
+                  headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+                  body=WARC_BODY)
 
     print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"))
     resp = spn_client.crawl_resource(TARGET, wayback_client)
@@ -201,4 +222,3 @@ def test_crawl_resource(spn_client, wayback_client):
     assert type(resp.cdx) == CdxPartial
     with pytest.raises(AttributeError):
         print(resp.cdx.warc_path)
-
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 83311b9..6ccf775 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -1,4 +1,3 @@
-
 import json
 
 import pytest
@@ -10,27 +9,66 @@ CDX_TARGET = "http://fatcat.wiki/"
 CDX_DT = "20180812220054"
 # cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
 CDX_SINGLE_HIT = [
-    ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
-    ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+    [
+        "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
+        "robotflags", "length", "offset", "filename"
+    ],
+    [
+        "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+    ],
 ]
 
 CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
 # cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
 CDX_MULTI_HIT = [
-    ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
-    ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
-    # sooner, but not right mimetype
-    ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
-    # sooner and mimetype, but wrong status code
-    ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
-    ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
-    ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
-    # "best"
-    ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
-    # older
-    ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+    [
+        "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
+        "robotflags", "length", "offset", "filename"
+    ],
+    [
+        "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+    ],
+    # sooner, but not right mimetype
+    [
+        "wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+    ],
+    # sooner and mimetype, but wrong status code
+    [
+        "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+    ],
+    [
+        "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+    ],
+    [
+        "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+    ],
+    # "best"
+    [
+        "wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-",
+        "-", "8445", "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+    ],
+    # older
+    [
+        "wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+    ],
 ]
 
+
 @pytest.fixture
 def cdx_client():
     client = CdxApiClient(
@@ -39,13 +77,14 @@ def cdx_client():
     )
     return client
 
+
 @responses.activate
 def test_cdx_fetch(cdx_client):
 
     responses.add(responses.GET,
-        'http://dummy-cdx/cdx',
-        status=200,
-        body=json.dumps(CDX_SINGLE_HIT))
+                  'http://dummy-cdx/cdx',
+                  status=200,
+                  body=json.dumps(CDX_SINGLE_HIT))
 
     resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
 
@@ -58,6 +97,7 @@ def test_cdx_fetch(cdx_client):
     assert resp.warc_offset == 108062304
     assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
 
+
 @responses.activate
 def test_cdx_fetch_errors(cdx_client):
 
@@ -65,9 +105,9 @@ def test_cdx_fetch_errors(cdx_client):
         resp = cdx_client.fetch(CDX_TARGET, "2019")
 
     responses.add(responses.GET,
-        'http://dummy-cdx/cdx',
-        status=200,
-        body=json.dumps(CDX_SINGLE_HIT))
+                  'http://dummy-cdx/cdx',
+                  status=200,
+                  body=json.dumps(CDX_SINGLE_HIT))
 
     with pytest.raises(KeyError):
         resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
@@ -78,13 +118,14 @@ def test_cdx_fetch_errors(cdx_client):
         resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
     assert len(responses.calls) == 3
 
+
 @responses.activate
 def test_cdx_lookup_best(cdx_client):
 
     responses.add(responses.GET,
-        'http://dummy-cdx/cdx',
-        status=200,
-        body=json.dumps(CDX_MULTI_HIT))
+                  'http://dummy-cdx/cdx',
+                  status=200,
+                  body=json.dumps(CDX_MULTI_HIT))
 
     resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
 
@@ -95,6 +136,7 @@ def test_cdx_lookup_best(cdx_client):
     assert resp.sha1b32 == CDX_BEST_SHA1B32
     assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
 
+
 WARC_TARGET = "http://fatcat.wiki/"
 WARC_BODY = b"""
 <html>
@@ -108,6 +150,7 @@ WARC_BODY = b"""
 </html>
 """
 
+
 @pytest.fixture
 def wayback_client(cdx_client, mocker):
     client = WaybackClient(
@@ -127,6 +170,7 @@ def wayback_client(cdx_client, mocker):
 
     return client
 
+
 @pytest.fixture
 def wayback_client_pdf(cdx_client, mocker):
 
@@ -150,6 +194,7 @@ def wayback_client_pdf(cdx_client, mocker):
 
     return client
 
+
 @responses.activate
 def test_wayback_fetch(wayback_client):
     resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
@@ -159,13 +204,14 @@ def test_wayback_fetch(wayback_client):
     resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
     assert resp == WARC_BODY
 
+
 @responses.activate
 def test_lookup_resource_success(wayback_client):
 
     responses.add(responses.GET,
-        'http://dummy-cdx/cdx',
-        status=200,
-        body=json.dumps(CDX_MULTI_HIT))
+                  'http://dummy-cdx/cdx',
+                  status=200,
+                  body=json.dumps(CDX_MULTI_HIT))
 
     resp = wayback_client.lookup_resource(CDX_TARGET)
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
index a996c56..1742f3a 100644
--- a/python/tests/test_xml.py
+++ b/python/tests/test_xml.py
@@ -1,11 +1,10 @@
-
 import pytest
 
 from sandcrawler.xml import xml_reserialize
 
 
 def test_xml_reserialize() -> None:
-
+
     with open('tests/files/scielo_article.jats.xml', 'rb') as f:
         raw_xml = f.read()