Diffstat (limited to 'python/scripts')
20 files changed, 1222 insertions, 526 deletions
| diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index 03a1f29..4561541 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -1,5 +1,4 @@  #!/usr/bin/env python3 -  """  This script is intended to be used for backfill ingest of old crawls. It can  also be used as a fast path for getting freshly crawled content into fatcat if @@ -12,9 +11,9 @@ Run like:  Can then run through requests using that tool, or dump into kafka queue.  """ -import sys -import json  import argparse +import json +import sys  def run(args): @@ -22,51 +21,54 @@ def run(args):          if not l.strip():              continue          row = json.loads(l) -        if not row['hit']: +        if not row["hit"]:              continue          request = { -            'base_url': row['final_url'], -            'ingest_type': args.ingest_type, -            'link_source': args.link_source, -            'link_source_id': row['identifier'], -            'ingest_request_source': args.ingest_request_source, -            'ext_ids': { -                args.extid_type: row['identifier'], +            "base_url": row["final_url"], +            "ingest_type": args.ingest_type, +            "link_source": args.link_source, +            "link_source_id": row["identifier"], +            "ingest_request_source": args.ingest_request_source, +            "ext_ids": { +                args.extid_type: row["identifier"],              },          }          if args.release_stage: -            assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update') -            request['release_stage'] = args.release_stage +            assert args.release_stage in ( +                "published", +                "submitted", +                "accepted", +                "draft", +                "update", +            ) +            request["release_stage"] = args.release_stage          print("{}".format(json.dumps(request, sort_keys=True))) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('--link-source', -        required=True, -        help="link_source to include in request") -    parser.add_argument('--extid-type', -        required=True, -        help="extid to encode identifier as") -    parser.add_argument('--ingest-type', -        default="pdf", -        help="ingest type (pdf, html, xml, etc)") -    parser.add_argument('--ingest-request-source', -        default="arabesque", -        help="to include in request") -    parser.add_argument('--release-stage', -        default=None, -        help="to include in request") -    parser.add_argument('json_file', -        help="arabesque output file to use", -        type=argparse.FileType('r')) +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "--link-source", required=True, help="link_source to include in request" +    ) +    parser.add_argument("--extid-type", required=True, help="extid to encode identifier as") +    parser.add_argument( +        "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)" +    ) +    parser.add_argument( +        "--ingest-request-source", default="arabesque", help="to include in request" +    ) +    parser.add_argument("--release-stage", default=None, help="to include in request") +    parser.add_argument( +        "json_file", help="arabesque output file to 
use", type=argparse.FileType("r") +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py new file mode 100755 index 0000000..6328f52 --- /dev/null +++ b/python/scripts/archiveorg_fileset.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +Helper script to  + +Takes either two args (release ident and archive.org item), or a stream of +tab-separated such pairs on stdin. + +TODO: +- should this check the item type? +""" + +import json +import sys +from typing import Any + +import internetarchive + +FORMAT_TO_MIMETYPE = { +    "BZIP": "application/x-bzip", +    "BZIP2": "application/x-bzip2", +    "ZIP": "application/zip", +    "GZIP": "application/gzip", +    "RAR": "application/vnd.rar", +    "TAR": "application/x-tar", +    "7z": "application/x-7z-compressed", +    "HTML": "text/html", +    "Text": "text/plain", +    "PDF": "application/pdf", +    "CSV": "text/csv", +    "XML": "application/xml", +    "JSON": "application/json", +    #'application/msword (.doc)', # .doc +    #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx +    #'application/vnd.ms-excel', # .xls +    #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx +    "MP3": "audio/mpeg",  # .mp3 +    "MP4": "video/mp4",  # .mp4 +    "MPEG": "video/mpeg",  # .mpeg +    "JPEG": "image/jpeg", +    "GIF": "image/gif", +    "PNG": "image/png", +    "TIFF": "image/tiff", +    "Unknown": None, +} + + +def want_file(f: dict, item_name: str) -> bool: +    """ +    Filters IA API files +    """ +    if f.source != "original": +        return False +    for suffix in [ +        "_meta.sqlite", +        "_archive.torrent", +        "_itemimage.jpg", +        "_meta.xml", +        "_thumb.png", +        "_files.xml", +    ]: +        if f.name == item_name + suffix or f.name == item_name.lower() + suffix: +            return False +    if f.name.startswith("_"): +        return False +    if item_name.startswith("academictorrents_"): +        for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]: +            if f.name == item_name + suffix: +                return False +    return True + + +def parse_file(f: dict) -> dict: +    """ +    Takes an IA API file and turns it in to a fatcat fileset manifest file +    """ +    assert f.name and f.sha1 and f.md5 +    assert f.name is not None +    mf = { +        "path": f.name, +        "size": int(f.size), +        "sha1": f.sha1, +        "md5": f.md5, +    } +    # TODO: will disable this hard check eventually and replace with: +    # mimetype = FORMAT_TO_MIMETYPE.get(f.format) +    mimetype = FORMAT_TO_MIMETYPE[f.format] +    if mimetype: +        mf["extra"] = dict(mimetype=mimetype) +    return mf + + +def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession): +    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr) +    if release_id.startswith("release_"): +        release_id = release_id[9:] +    assert len(release_id) == 26 +    item = session.get_item(item_name) +    assert item.metadata["mediatype"] not in ["collection", "web"] +    item_files = item.get_files(on_the_fly=False) +    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)] +    fileset = { +        "manifest": manifest, +        "urls": [ +            { +      
          "rel": "archive", +                "url": f"https://archive.org/download/{item_name}/", +            }, +        ], +        "release_ids": [release_id], +        # extra={}, +    } +    print(json.dumps(fileset)) +    return fileset + + +def main(): +    session = internetarchive.get_session() +    if len(sys.argv) == 3: +        item_name = sys.argv[1] +        release_id = sys.argv[2] +        item_to_fileset(item_name, release_id=release_id, session=session) +    else: +        for line in sys.stdin: +            line = line.strip() +            if not line: +                continue +            fields = line.split("\t") +            assert len(fields) == 2 +            item_name = fields[0] +            release_id = fields[1] +            item_to_fileset(item_name, release_id=release_id, session=session) + + +if __name__ == "__main__": +    main() diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py new file mode 100755 index 0000000..0b60da3 --- /dev/null +++ b/python/scripts/cdx_collection.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +""" +Fetches and merges all CDX files for a collection. + +Calls metadata API to enumerate all items/files, then fetches and concatanates +them all. Requires the 'internetarchive' library. + +Call with a collection name: + +    ./cdx_collection SOME_COLLECTION_NAME +""" + +import os +import shutil +import subprocess +import sys +import tempfile + +import internetarchive as ia +import requests + + +def run(): + +    if len(sys.argv) != 2: +        print("Expected a single argument (collection name)") +        sys.exit(-1) + +    collection = sys.argv[1] + +    # Check collection name is clean +    assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum() + +    tempdir = tempfile.mkdtemp() +    print("Looking up collection: {}".format(collection)) + +    # First fetch list +    item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection))) + +    if len(item_list) == 0: +        print("No items found, bailing") +        sys.exit(-1) + +    print("Found {} potential items".format(len(item_list))) +    status = True +    errors = [] +    for item in item_list: +        item = item["identifier"] +        # TODO: error handling +        try: +            ret = ia.download( +                item, +                files=[item + ".cdx.gz"], +                verbose=True, +                destdir=tempdir, +                no_directory=True, +                retries=1000, +            ) +            status = ret and status +        except requests.exceptions.ReadTimeout as rt: +            print(str(rt), file=sys.stderr) +            errors.append(rt) +            continue + +    if errors: +        print("## Download Errors", file=sys.stderr) +        for e in errors: +            print(e, file=sys.stderr) + +    # Combine files +    print("Merging and re-compressing all CDX files...") +    # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir), +    subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True) + +    # Move and cleanup +    shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection)) + +    print("Done!") + + +if __name__ == "__main__": +    run() diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 33c425d..e3bf4f0 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -1,12 +1,12 @@  #!/usr/bin/env python3 
-  """  Transform an unpaywall dump (JSON) into ingest requests.  """ -import sys -import json  import argparse +import json +import sys +  import urlcanon @@ -18,38 +18,44 @@ def canon(s):  def transform_cnki(obj):      requests = [] -    assert obj['cnki_id'] - +    assert obj["cnki_id"]      requests = [] -    requests.append({ -        'base_url': canon(obj['info_url']), -        'ingest_type': 'pdf', -        'link_source': 'cnki_covid19', -        'link_source_id': obj['cnki_id'], -        'ingest_request_source': 'scrape-covid19', -    }) -    if 'read_url' in obj: -        requests.append({ -            'base_url': canon(obj['read_url']), -            'ingest_type': 'pdf',  # actually HTML -            'link_source': 'cnki_covid19', -            'link_source_id': obj['cnki_id'], -            'ingest_request_source': 'scrape-covid19', -        }) +    requests.append( +        { +            "base_url": canon(obj["info_url"]), +            "ingest_type": "pdf", +            "link_source": "cnki_covid19", +            "link_source_id": obj["cnki_id"], +            "ingest_request_source": "scrape-covid19", +        } +    ) +    if "read_url" in obj: +        requests.append( +            { +                "base_url": canon(obj["read_url"]), +                "ingest_type": "pdf",  # actually HTML +                "link_source": "cnki_covid19", +                "link_source_id": obj["cnki_id"], +                "ingest_request_source": "scrape-covid19", +            } +        )      return requests +  def transform_wanfang(obj): -    assert obj['wanfang_id'] -    return [{ -        'base_url': canon(obj['url']), -        'ingest_type': 'pdf', -        'link_source': 'wanfang_covid19', -        'link_source_id': obj['wanfang_id'], -        'ingest_request_source': 'scrape-covid19', -    }] +    assert obj["wanfang_id"] +    return [ +        { +            "base_url": canon(obj["url"]), +            "ingest_type": "pdf", +            "link_source": "wanfang_covid19", +            "link_source_id": obj["wanfang_id"], +            "ingest_request_source": "scrape-covid19", +        } +    ]  def run(args): @@ -58,26 +64,27 @@ def run(args):              continue          row = json.loads(l) -        if 'wanfang_id' in row: +        if "wanfang_id" in row:              requests = transform_wanfang(row) or [] -        elif 'cnki_id' in row: +        elif "cnki_id" in row:              requests = transform_cnki(row) or []          else:              continue          for r in requests:              print("{}".format(json.dumps(r, sort_keys=True))) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('json_file', -        help="COVID-19 metadata file to use", -        type=argparse.FileType('r')) +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r") +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 86b3b35..27ccf21 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -19,23 +19,20 @@ Output:  - log to stdout (redirect to file), prefixed by sha1  Requires: -- raven (sentry) +- 
sentry-sdk  - boto3 (AWS S3 client library)  """ -import os -import sys -import json +import argparse  import base64  import hashlib -import argparse +import json +import os +import sys  from collections import Counter  import boto3 -import raven - -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +import sentry_sdk  def b32_hex(s): @@ -45,81 +42,80 @@ def b32_hex(s):          s = s[5:]      if len(s) != 32:          return s -    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') - +    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") -class DeliverDumpGrobidS3(): +class DeliverDumpGrobidS3:      def __init__(self, s3_bucket, **kwargs):          self.rstore = None          self.count = Counter()          self.s3_bucket = s3_bucket -        self.s3_prefix = kwargs.get('s3_prefix', 'grobid/') -        self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml') -        self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD') -        self.s3 = boto3.resource('s3') +        self.s3_prefix = kwargs.get("s3_prefix", "grobid/") +        self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml") +        self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD") +        self.s3 = boto3.resource("s3")          self.bucket = self.s3.Bucket(self.s3_bucket)      def run(self, dump_file):          sys.stderr.write("Starting...\n")          for line in dump_file: -            line = line.strip().split('\t') +            line = line.strip().split("\t")              if len(line) != 2: -                self.count['skip-line'] += 1 +                self.count["skip-line"] += 1                  continue              sha1_hex, grobid_json = line[0], line[1]              if len(sha1_hex) != 40:                  sha1_hex = b32_hex(sha1_hex)              assert len(sha1_hex) == 40              grobid = json.loads(grobid_json) -            tei_xml = grobid.get('tei_xml') +            tei_xml = grobid.get("tei_xml")              if not tei_xml:                  print("{}\tskip empty".format(sha1_hex)) -                self.count['skip-empty'] += 1 +                self.count["skip-empty"] += 1                  continue -            tei_xml = tei_xml.encode('utf-8') +            tei_xml = tei_xml.encode("utf-8")              # upload to AWS S3              obj = self.bucket.put_object( -                Key="{}{}/{}{}".format( -                    self.s3_prefix, -                    sha1_hex[0:4], -                    sha1_hex, -                    self.s3_suffix), +                Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),                  Body=tei_xml,                  StorageClass=self.s3_storage_class,              )              print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml))) -            self.count['success-s3'] += 1 +            self.count["success-s3"] += 1          sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions +  def main():      parser = argparse.ArgumentParser() -    parser.add_argument('--s3-bucket', -                        required=True, -                        type=str, -                        help='AWS S3 bucket to upload into') -    parser.add_argument('--s3-prefix', -                        type=str, -                        default="grobid/", -                        help='key prefix for items created in bucket') -    parser.add_argument('--s3-suffix', -                        type=str, -         
               default=".tei.xml", -                        help='file suffix for created objects') -    parser.add_argument('--s3-storage-class', -                        type=str, -                        default="STANDARD", -                        help='AWS S3 storage class (redundancy) to use') -    parser.add_argument('dump_file', -                        help="TSV/JSON dump file", -                        default=sys.stdin, -                        type=argparse.FileType('r')) +    parser.add_argument( +        "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into" +    ) +    parser.add_argument( +        "--s3-prefix", +        type=str, +        default="grobid/", +        help="key prefix for items created in bucket", +    ) +    parser.add_argument( +        "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects" +    ) +    parser.add_argument( +        "--s3-storage-class", +        type=str, +        default="STANDARD", +        help="AWS S3 storage class (redundancy) to use", +    ) +    parser.add_argument( +        "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r") +    )      args = parser.parse_args() +    sentry_sdk.init() +      worker = DeliverDumpGrobidS3(**args.__dict__)      worker.run(args.dump_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == "__main__":  # pragma: no cover      main() diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index 3dcf962..093f32a 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -7,160 +7,191 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.  # in `wayback` library. Means we can't run pylint.  # pylint: skip-file -import os -import sys -import json +import argparse  import base64  import hashlib -import argparse +import json +import os +import sys  from collections import Counter +from http.client import IncompleteRead -import raven +import sentry_sdk  import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore  from gwb.loader import CDXLoaderFactory - -# Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +from wayback.resourcestore import ResourceStore  class DeliverGwbDisk: -      def __init__(self, disk_dir, **kwargs): -        self.warc_uri_prefix = kwargs.get('warc_uri_prefix') +        self.warc_uri_prefix = kwargs.get("warc_uri_prefix")          self.rstore = None          self.count = Counter()          # /serve/ instead of /download/ doesn't record view count -        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') +        self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")          # gwb library will fall back to reading from /opt/.petabox/webdata.secret -        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) +        self.petabox_webdata_secret = kwargs.get( +            "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET") +        )          self.disk_dir = disk_dir -        self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') -        self.disk_suffix = kwargs.get('disk_suffix', '.pdf') +        self.disk_prefix = kwargs.get("disk_prefix", "pdf/") +        self.disk_suffix = kwargs.get("disk_suffix", ".pdf")      def fetch_warc_content(self, warc_path, offset, c_size):          warc_uri = self.warc_uri_prefix + warc_path          if not self.rstore: -            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( -                webdata_secret=self.petabox_webdata_secret, -                download_base_url=self.petabox_base_url)) +            self.rstore = ResourceStore( +                loaderfactory=CDXLoaderFactory( +                    webdata_secret=self.petabox_webdata_secret, +                    download_base_url=self.petabox_base_url, +                ) +            )          try:              gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)          except wayback.exception.ResourceUnavailable: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)", +            )          except ValueError as ve: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (ValueError: {})".format( +                    ve +                ), +            )          except EOFError as eofe: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (EOFError: {})".format( +                    eofe +                ), +            )          except TypeError as te: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python 
code)".format( +                    te +                ), +            )          # Note: could consider a generic "except Exception" here, as we get so          # many petabox errors. Do want jobs to fail loud and clear when the          # whole cluster is down though.          if gwb_record.get_status()[0] != 200: -            return None, dict(status="error", +            return None, dict( +                status="error",                  reason="archived HTTP response (WARC) was not 200", -                warc_status=gwb_record.get_status()[0]) +                warc_status=gwb_record.get_status()[0], +            )          try:              raw_content = gwb_record.open_raw_content().read()          except IncompleteRead as ire: -            return None, dict(status="error", -                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) +            return None, dict( +                status="error", +                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( +                    ire +                ), +            )          return raw_content, None      def run(self, manifest_file):          sys.stderr.write("Ensuring all 65536 base directories exist...\n")          for i in range(256):              for j in range(256): -                fpath = "{}/{}{:02x}/{:02x}".format( -                        self.disk_dir, -                        self.disk_prefix, -                        i, -                        j) +                fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j)                  os.makedirs(fpath, exist_ok=True)          sys.stderr.write("Starting...\n")          for line in manifest_file: -            self.count['total'] += 1 -            line = line.strip().split('\t') +            self.count["total"] += 1 +            line = line.strip().split("\t")              if len(line) != 2: -                self.count['skip-line'] += 1 +                self.count["skip-line"] += 1                  continue              sha1_hex, cdx_json = line[0], line[1]              assert len(sha1_hex) == 40              file_cdx = json.loads(cdx_json)              # If warc is not item/file.(w)arc.gz form, skip it -            if len(file_cdx['warc'].split('/')) != 2: -                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) -                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) -                self.count['skip-warc'] += 1 +            if len(file_cdx["warc"].split("/")) != 2: +                sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"])) +                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"])) +                self.count["skip-warc"] += 1                  continue              # fetch from GWB/petabox via HTTP range-request -            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) +            blob, status = self.fetch_warc_content( +                file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"] +            )              if blob is None and status: -                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) -                self.count['err-petabox-fetch'] += 1 +                print( +                    "{}\terror petabox\t{}\t{}".format( +                        sha1_hex, file_cdx["warc"], status["reason"] +                    
) +                ) +                self.count["err-petabox-fetch"] += 1                  continue              elif not blob:                  print("{}\tskip-empty-blob".format(sha1_hex)) -                self.count['skip-empty-blob'] += 1 +                self.count["skip-empty-blob"] += 1                  continue              # verify sha1              if sha1_hex != hashlib.sha1(blob).hexdigest(): -                #assert sha1_hex == hashlib.sha1(blob).hexdigest() -                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) +                # assert sha1_hex == hashlib.sha1(blob).hexdigest() +                # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))                  print("{}\terror petabox-hash-mismatch".format(sha1_hex)) -                self.count['err-petabox-hash-mismatch'] += 1 +                self.count["err-petabox-hash-mismatch"] += 1 -            self.count['petabox-ok'] += 1 +            self.count["petabox-ok"] += 1              # save to disk              fpath = "{}/{}{}/{}/{}{}".format( -                    self.disk_dir, -                    self.disk_prefix, -                    sha1_hex[0:2], -                    sha1_hex[2:4], -                    sha1_hex, -                    self.disk_suffix) -            with open(fpath, 'wb') as f: +                self.disk_dir, +                self.disk_prefix, +                sha1_hex[0:2], +                sha1_hex[2:4], +                sha1_hex, +                self.disk_suffix, +            ) +            with open(fpath, "wb") as f:                  f.write(blob)              print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob))) -            self.count['success-disk'] += 1 +            self.count["success-disk"] += 1          sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions +  def main():      parser = argparse.ArgumentParser() -    parser.add_argument('--disk-dir', -                        required=True, -                        type=str, -                        help='local base directory to save into') -    parser.add_argument('--disk-prefix', -                        type=str, -                        default="pdf/", -                        help='directory prefix for items created in bucket') -    parser.add_argument('--disk-suffix', -                        type=str, -                        default=".pdf", -                        help='file suffix for created files') -    parser.add_argument('--warc-uri-prefix', -                        type=str, -                        default='https://archive.org/serve/', -                        help='URI where WARCs can be found') -    parser.add_argument('manifest_file', -                        help="TSV/JSON manifest file", -                        default=sys.stdin, -                        type=argparse.FileType('r')) +    parser.add_argument( +        "--disk-dir", required=True, type=str, help="local base directory to save into" +    ) +    parser.add_argument( +        "--disk-prefix", +        type=str, +        default="pdf/", +        help="directory prefix for items created in bucket", +    ) +    parser.add_argument( +        "--disk-suffix", type=str, default=".pdf", help="file suffix for created files" +    ) +    parser.add_argument( +        "--warc-uri-prefix", +        type=str, +        default="https://archive.org/serve/", +        help="URI where WARCs can be found", +    ) +    parser.add_argument( +        "manifest_file", +        help="TSV/JSON manifest 
file", +        default=sys.stdin, +        type=argparse.FileType("r"), +    )      args = parser.parse_args() +    sentry_sdk.init() +      worker = DeliverGwbDisk(**args.__dict__)      worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == "__main__":  # pragma: no cover      main() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index 39ac000..6f37ede 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -24,7 +24,7 @@ Output:  - log to stdout (redirect to file), prefixed by sha1  Requires: -- raven (sentry) +- sentry-sdk  - boto3 (AWS S3 client library)  - wayback/GWB libraries  """ @@ -33,152 +33,180 @@ Requires:  # in `wayback` library. Means we can't run pylint.  # pylint: skip-file -import os -import sys -import json +import argparse  import base64  import hashlib -import argparse +import json +import os +import sys  from collections import Counter +from http.client import IncompleteRead  import boto3 -import raven +import sentry_sdk  import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore  from gwb.loader import CDXLoaderFactory - -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +from wayback.resourcestore import ResourceStore  class DeliverGwbS3: -      def __init__(self, s3_bucket, **kwargs): -        self.warc_uri_prefix = kwargs.get('warc_uri_prefix') +        self.warc_uri_prefix = kwargs.get("warc_uri_prefix")          self.rstore = None          self.count = Counter()          # /serve/ instead of /download/ doesn't record view count -        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') +        self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")          # gwb library will fall back to reading from /opt/.petabox/webdata.secret -        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) +        self.petabox_webdata_secret = kwargs.get( +            "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET") +        )          self.s3_bucket = s3_bucket -        self.s3_prefix = kwargs.get('s3_prefix', 'pdf/') -        self.s3_suffix = kwargs.get('s3_suffix', '.pdf') -        self.s3 = boto3.resource('s3') +        self.s3_prefix = kwargs.get("s3_prefix", "pdf/") +        self.s3_suffix = kwargs.get("s3_suffix", ".pdf") +        self.s3 = boto3.resource("s3")          self.bucket = self.s3.Bucket(self.s3_bucket)      def fetch_warc_content(self, warc_path, offset, c_size):          warc_uri = self.warc_uri_prefix + warc_path          if not self.rstore: -            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( -                webdata_secret=self.petabox_webdata_secret, -                download_base_url=self.petabox_base_url)) +            self.rstore = ResourceStore( +                loaderfactory=CDXLoaderFactory( +                    webdata_secret=self.petabox_webdata_secret, +                    download_base_url=self.petabox_base_url, +                ) +            )          try:              gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)          except wayback.exception.ResourceUnavailable: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") +            return None, dict( +       
         status="error", +                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)", +            )          except ValueError as ve: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (ValueError: {})".format( +                    ve +                ), +            )          except EOFError as eofe: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (EOFError: {})".format( +                    eofe +                ), +            )          except TypeError as te: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format( +                    te +                ), +            )          # Note: could consider a generic "except Exception" here, as we get so          # many petabox errors. Do want jobs to fail loud and clear when the          # whole cluster is down though.          if gwb_record.get_status()[0] != 200: -            return None, dict(status="error", +            return None, dict( +                status="error",                  reason="archived HTTP response (WARC) was not 200", -                warc_status=gwb_record.get_status()[0]) +                warc_status=gwb_record.get_status()[0], +            )          try:              raw_content = gwb_record.open_raw_content().read()          except IncompleteRead as ire: -            return None, dict(status="error", -                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) +            return None, dict( +                status="error", +                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( +                    ire +                ), +            )          return raw_content, None      def run(self, manifest_file):          sys.stderr.write("Starting...\n")          for line in manifest_file: -            self.count['total'] += 1 -            line = line.strip().split('\t') +            self.count["total"] += 1 +            line = line.strip().split("\t")              if len(line) != 2: -                self.count['skip-line'] += 1 +                self.count["skip-line"] += 1                  continue              sha1_hex, cdx_json = line[0], line[1]              assert len(sha1_hex) == 40              file_cdx = json.loads(cdx_json)              # If warc is not item/file.(w)arc.gz form, skip it -            if len(file_cdx['warc'].split('/')) != 2: -                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) -                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) -                self.count['skip-warc'] += 1 +            if len(file_cdx["warc"].split("/")) != 2: +                sys.stderr.write("WARC 
path not petabox item/file: {}".format(file_cdx["warc"])) +                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"])) +                self.count["skip-warc"] += 1                  continue              # fetch from GWB/petabox via HTTP range-request -            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) +            blob, status = self.fetch_warc_content( +                file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"] +            )              if blob is None and status: -                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) -                self.count['err-petabox-fetch'] += 1 +                print( +                    "{}\terror petabox\t{}\t{}".format( +                        sha1_hex, file_cdx["warc"], status["reason"] +                    ) +                ) +                self.count["err-petabox-fetch"] += 1                  continue              elif not blob:                  print("{}\tskip-empty-blob".format(sha1_hex)) -                self.count['skip-empty-blob'] += 1 +                self.count["skip-empty-blob"] += 1                  continue              # verify sha1              if sha1_hex != hashlib.sha1(blob).hexdigest(): -                #assert sha1_hex == hashlib.sha1(blob).hexdigest() -                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) +                # assert sha1_hex == hashlib.sha1(blob).hexdigest() +                # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))                  print("{}\terror petabox-hash-mismatch".format(sha1_hex)) -                self.count['err-petabox-hash-mismatch'] += 1 +                self.count["err-petabox-hash-mismatch"] += 1 -            self.count['petabox-ok'] += 1 +            self.count["petabox-ok"] += 1              # upload to AWS S3              obj = self.bucket.put_object( -                Key="{}{}/{}{}".format( -                    self.s3_prefix, -                    sha1_hex[0:4], -                    sha1_hex, -                    self.s3_suffix), -                Body=blob) +                Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix), +                Body=blob, +            )              print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob))) -            self.count['success-s3'] += 1 +            self.count["success-s3"] += 1          sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions +  def main():      parser = argparse.ArgumentParser() -    parser.add_argument('--s3-bucket', -                        required=True, -                        type=str, -                        help='AWS S3 bucket to upload into') -    parser.add_argument('--s3-prefix', -                        type=str, -                        default="pdf/", -                        help='key prefix for items created in bucket') -    parser.add_argument('--s3-suffix', -                        type=str, -                        default=".pdf", -                        help='file suffix for created objects') -    parser.add_argument('--warc-uri-prefix', -                        type=str, -                        default='https://archive.org/serve/', -                        help='URI where WARCs can be found') -    parser.add_argument('manifest_file', -                        help="TSV/JSON manifest file", -                        default=sys.stdin, -                        
type=argparse.FileType('r')) +    parser.add_argument( +        "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into" +    ) +    parser.add_argument( +        "--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket" +    ) +    parser.add_argument( +        "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects" +    ) +    parser.add_argument( +        "--warc-uri-prefix", +        type=str, +        default="https://archive.org/serve/", +        help="URI where WARCs can be found", +    ) +    parser.add_argument( +        "manifest_file", +        help="TSV/JSON manifest file", +        default=sys.stdin, +        type=argparse.FileType("r"), +    )      args = parser.parse_args() +    sentry_sdk.init() +      worker = DeliverGwbS3(**args.__dict__)      worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == "__main__":  # pragma: no cover      main() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py new file mode 100755 index 0000000..aef5c12 --- /dev/null +++ b/python/scripts/doaj2ingestrequest.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Transform an DOAJ article dump (JSON) into ingest requests. + +TODO: should we also attempt PDF ingest for HTML links? They seem to often be +landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url` +in the HTML headers and adds an ingest request on that basis. Or even just run +the re-ingest in-process and publish a second result. +""" + +import argparse +import json +import sys +from typing import List, Optional + +import urlcanon + +DOMAIN_BLOCKLIST = [ +    # large OA publishers (we get via DOI) +    # large repos and aggregators (we crawl directly) +    "://arxiv.org/", +    "://europepmc.org/", +    "ncbi.nlm.nih.gov/", +    # "semanticscholar.org/", +    "://doi.org/", +    "://dx.doi.org/", +    "zenodo.org/", +    "figshare.com/", +    "://archive.org/", +    ".archive.org/", +    # large publishers/platforms; may remove in the future +    # "://link.springer.com/", +    # "://dergipark.gov.tr/", +    # "frontiersin.org/", +    # "scielo", +] + +# these default to PDF; note that we also do pdf ingests for HTML pages +CONTENT_TYPE_MAP = { +    "abstract": [], +    "doc": [], +    "": ["pdf"], +    "doi": ["pdf"], +    "url": ["pdf"], +    "fulltext": ["pdf"], +    "anySimpleType": ["pdf"], +    "application/pdf": ["pdf"], +    "html": ["html", "pdf"], +    "text/html": ["html", "pdf"], +    "xml": ["xml"], +} + + +def canon(s: str) -> str: +    parsed = urlcanon.parse_url(s) +    return str(urlcanon.whatwg(parsed)) + + +def transform(obj: dict) -> List[dict]: +    """ +    Transforms from a single DOAJ object to zero or more ingest requests. +    Returns a list of dicts. 
+    """ + +    doaj_id = obj["id"].lower() +    assert doaj_id + +    bibjson = obj["bibjson"] +    if not bibjson["link"]: +        return [] + +    requests = [] + +    doi: Optional[str] = None +    for ident in bibjson["identifier"] or []: +        if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."): +            doi = ident["id"].lower() + +    for link in bibjson["link"] or []: +        if link.get("type") != "fulltext" or not link.get("url"): +            continue +        ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower()) +        if not ingest_types: +            continue + +        skip = False +        for domain in DOMAIN_BLOCKLIST: +            if domain in link["url"].lower(): +                skip = True +        if skip: +            continue +        try: +            base_url = canon(link["url"].strip()) +        except UnicodeEncodeError: +            continue + +        if not base_url or len(base_url) > 1000: +            continue + +        for ingest_type in ingest_types: +            request = { +                "base_url": base_url, +                "ingest_type": ingest_type, +                "link_source": "doaj", +                "link_source_id": doaj_id, +                "ingest_request_source": "doaj", +                "release_stage": "published", +                "rel": "publisher", +                "ext_ids": { +                    "doi": doi, +                    "doaj": doaj_id, +                }, +                "edit_extra": {}, +            } +            requests.append(request) + +    return requests + + +def run(args) -> None: +    for l in args.json_file: +        if not l.strip(): +            continue +        row = json.loads(l) + +        requests = transform(row) or [] +        for r in requests: +            print("{}".format(json.dumps(r, sort_keys=True))) + + +def main() -> None: +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r") +    ) +    subparsers = parser.add_subparsers() + +    args = parser.parse_args() + +    run(args) + + +if __name__ == "__main__": +    main() diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py index 9fe1499..44c091c 100755 --- a/python/scripts/enrich_scored_matches.py +++ b/python/scripts/enrich_scored_matches.py @@ -17,29 +17,32 @@ And outputs JSON objects that are can be imported into fatcat with the  No dependencies (only python3 stdlib)  """ -import sys -import json  import base64 +import json +import sys +  def run():      for line in sys.stdin: -        line = line.split('\t') +        line = line.split("\t")          assert len(line) == 5 -        raw_sha1 = line[0].replace('sha1:', '') +        raw_sha1 = line[0].replace("sha1:", "")          dois = json.loads(line[1])          cdx = json.loads(line[2])          mimetype = line[3]          size = int(line[4]) -        sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower() +        sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower()          obj = dict(              sha1=sha1,              dois=dois, -            cdx=[dict(url=cdx['url'], dt=cdx['dt'])], +            cdx=[dict(url=cdx["url"], dt=cdx["dt"])],              size=size, -            mimetype=mimetype) +            mimetype=mimetype, +        )          print(json.dumps(obj)) -if 
__name__=='__main__': + +if __name__ == "__main__":      run() diff --git a/python/scripts/fetch_cdx_sha1hex.py b/python/scripts/fetch_cdx_sha1hex.py new file mode 100755 index 0000000..2eb56cb --- /dev/null +++ b/python/scripts/fetch_cdx_sha1hex.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +""" +This is a helper script to take fatcat file entities with partial metadata (eg, +missing SHA256) and try to find one or more CDX record where the file may be +found in wayback. + +This script uses the sandcrawler library and should be run like: + +    head file_export.json | python -m scripts.fetch_cdx_sha1hex > results.json +""" + +import base64 +import json +import sys +from typing import List, Optional + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error + +from sandcrawler.ia import CdxApiClient, cdx_to_dict + + +def requests_retry_session( +    retries: int = 10, +    backoff_factor: int = 3, +    status_forcelist: List[int] = [500, 502, 504], +    session: requests.Session = None, +) -> requests.Session: +    """ +    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests +    """ +    session = session or requests.Session() +    retry = Retry( +        total=retries, +        read=retries, +        connect=retries, +        backoff_factor=backoff_factor, +        status_forcelist=status_forcelist, +    ) +    adapter = HTTPAdapter(max_retries=retry) +    session.mount("http://", adapter) +    session.mount("https://", adapter) +    return session + + +def b32_hex(s: str) -> str: +    """ +    Converts a base32-encoded SHA-1 checksum into hex-encoded + +    base32 checksums are used by, eg, heritrix and in wayback CDX files +    """ +    s = s.strip().split()[0].lower() +    if s.startswith("sha1:"): +        s = s[5:] +    if len(s) != 32: +        if len(s) == 40: +            return s +        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) +    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") + + +SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030" + + +def get_db_cdx(sha1hex: str, http_session) -> List[dict]: +    resp = http_session.get( +        SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(sha1hex="eq." + sha1hex) +    ) +    resp.raise_for_status() +    rows = resp.json() +    return rows or [] + + +CDX_API_URL = "https://web.archive.org/cdx/search/cdx" + + +def get_api_cdx(url: str, sha1hex: str, cdx_api) -> Optional[dict]: + +    params = { +        "url": url, +        "output": "json", +        "matchType": "exact", +        "limit": 20, +        # TODO: group-by digest/checksum? 
+        # can't filter status because might be warc/revisit +        # "filter": "statuscode:200", +    } +    rows = cdx_api._query_api(params) +    if not rows: +        return None +    for row in rows: +        if row.sha1hex == sha1hex: +            return row +    return None + + +def process_file(fe, session, cdx_api) -> dict: +    status = "unknown" + +    # simple CDX db lookup first +    cdx_row_list = get_db_cdx(fe["sha1"], http_session=session) +    if cdx_row_list: +        return dict( +            file_entity=fe, +            cdx_rows=cdx_row_list, +            status="success-db", +        ) + +    original_urls = [] +    for pair in fe["urls"]: +        u = pair["url"] +        if not "://web.archive.org/web/" in u: +            continue +        seg = u.split("/") +        assert seg[2] == "web.archive.org" +        assert seg[3] == "web" +        if not seg[4].isdigit(): +            continue +        original_url = "/".join(seg[5:]) +        original_urls.append(original_url) + +    if len(original_urls) == 0: +        return dict(file_entity=fe, status="skip-no-urls") + +    found_cdx_rows = [] +    for url in list(set(original_urls)): + +        cdx_record = None +        try: +            cdx_record = get_api_cdx(original_url, sha1hex=fe["sha1"], cdx_api=cdx_api) +        except requests.exceptions.HTTPError as e: +            if e.response.status_code == 403: +                return dict(file_entity=fe, status="fail-cdx-403") +            else: +                raise +        if cdx_record and cdx_record.sha1hex == fe["sha1"]: +            found_cdx_rows.append(cdx_to_dict(cdx_record)) + +    if found_cdx_rows: +        return dict( +            file_entity=fe, +            cdx_rows=found_cdx_rows, +            status="success-api", +        ) + +    return dict( +        file_entity=fe, +        status="fail-not-found", +    ) + + +def main(): +    session = requests_retry_session() +    session.headers.update( +        { +            "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot", +        } +    ) +    cdx_api = CdxApiClient() +    for line in sys.stdin: +        if not line.strip(): +            continue +        fe = json.loads(line) +        print(json.dumps(process_file(fe, session=session, cdx_api=cdx_api))) + + +if __name__ == "__main__": +    main() diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index c33ab86..8fce0d9 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -1,43 +1,48 @@  #!/usr/bin/env python3 -import sys  import json +import sys -with open('title_slug_blacklist.txt', 'r') as f: -    TITLE_BLACKLIST = [l.strip() for l in f] - -TITLE_BLACKLIST.extend(( -    'editorial', -    'advertisement', -    'bookreviews', -    'reviews', -    'nr', -    'abstractoriginalarticle', -    'originalarticle', -    'impactfactor', -    'articlenumber', -)) +with open("title_slug_denylist.txt", "r") as f: +    TITLE_DENYLIST = [l.strip() for l in f] + +TITLE_DENYLIST.extend( +    ( +        "editorial", +        "advertisement", +        "bookreviews", +        "reviews", +        "nr", +        "abstractoriginalarticle", +        "originalarticle", +        "impactfactor", +        "articlenumber", +    ) +)  # The full name can't *entirely* be one of these -NAME_BLACKLIST = ( -    'phd', -    'phdstudent', +NAME_DENYLIST = ( +    "phd", +    "phdstudent",  ) +  def tokenize(s, remove_whitespace=True): -    s.replace(''', "'") +    s.replace("'", "'")      # 
Remove non-alphanumeric characters -    s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()]) +    s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])      if remove_whitespace: -        s = ''.join(s.split()) +        s = "".join(s.split())      # Encode as dumb ASCII (TODO: this is horrible) -    return s.encode('ascii', 'replace').decode('utf8').replace('?', '') +    return s.encode("ascii", "replace").decode("utf8").replace("?", "") +  assert tokenize("Impact Factor: 2.114") == "impactfactor" -assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST +assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST +  def filter_title(title): @@ -45,16 +50,16 @@ def filter_title(title):      if len(title) > 500:          return None      title_slug = tokenize(title, remove_whitespace=True) -    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST: +    if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:          return None -    if title_slug.startswith('nr'): +    if title_slug.startswith("nr"):          return None -    if title.lower().replace('.', '').startswith('int j '): +    if title.lower().replace(".", "").startswith("int j "):          return None      for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):          if title.startswith(prefix): -            title.replace(prefix, '') +            title.replace(prefix, "")      if title.startswith("The Journal of "):          return None @@ -78,63 +83,84 @@ def filter_title(title):          return None      # too deep subtitling/splitting -    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1: +    if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1:          return None      return title +  def filter_author_name(name): -    name = name['name'] -    if name.strip().lower().replace(' ', '') in NAME_BLACKLIST: +    name = name["name"] +    if name.strip().lower().replace(" ", "") in NAME_DENYLIST:          return None -    return ' '.join([t for t in name.split() if tokenize(t)]) +    return " ".join([t for t in name.split() if tokenize(t)]) +  def filter_authors(l):      return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1] +  def filter_refs(l):      # TODO:      return l +  def filter_journal_name(name): -    # same blacklist, for now +    # same denylist, for now      if not name:          return None -    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '') +    name = name.replace(" e-ISSN", "").replace(" p-ISSN", "")      slug_name = tokenize(name) -    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º": -        return None -    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "): +    if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º": +        return None +    for prefix in ( +        "/ ", +        "~ ", +        "& ", +        "© ", +        "Original Research Article ", +        "Original Article ", +        "Research Article ", +        "Available online www.jocpr.com ", +    ):          if name.startswith(prefix): -            name = name.replace(prefix, '') -    for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"): +            name = name.replace(prefix, "") +    for suffix in ( +        " Available online at www.sciarena.com", +        " Original Article", +        " Available online at", + 
       " ISSN", +        " ISSUE", +    ):          if name.endswith(suffix): -            name = name.replace(suffix, '') +            name = name.replace(suffix, "")      if "====================" in name:          return None      if len(name) > 150:          return None -    return ' '.join(name.split()) +    return " ".join(name.split()) +  def filter_metadata(obj): -    if not (obj.get('title') and obj.get('authors')): +    if not (obj.get("title") and obj.get("authors")):          return None -    title = filter_title(obj['title']) +    title = filter_title(obj["title"])      if not title: -        #sys.stderr.write("bad title\n") +        # sys.stderr.write("bad title\n")          return None      else: -        obj['title'] = title -    obj['authors'] = filter_authors(obj['authors']) -    obj['citations'] = filter_refs(obj['citations']) -    obj['journal']['name'] = filter_journal_name(obj['journal']['name']) +        obj["title"] = title +    obj["authors"] = filter_authors(obj["authors"]) +    obj["citations"] = filter_refs(obj["citations"]) +    obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"])      return obj +  def run(invert=False):      for line in sys.stdin: -        fields = line.split('\t') +        fields = line.split("\t")          if len(fields) == 5:              raw = fields[4]          elif len(fields) == 1: @@ -151,9 +177,10 @@ def run(invert=False):                      fields[4] = processed                  else:                      fields[0] = processed -                print('\t'.join(fields)) +                print("\t".join(fields))          elif invert:              print(raw.strip()) -if __name__=="__main__": + +if __name__ == "__main__":      run(invert="--invert" in sys.argv) diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index bbba770..87dae16 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:  - dates differ (not just year)  """ -import sys  import json +import sys  # out of 1000  SCORE_THRESHOLD = 900 @@ -28,17 +28,19 @@ MAX_SLUG_LINES = 50  REQUIRE_AUTHORS = False +  def tokenize(s, remove_whitespace=False): -    s.replace(''', "'") +    s.replace("'", "'")      # Remove non-alphanumeric characters -    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) +    s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])      if remove_whitespace: -        s = ''.join(s.split()) +        s = "".join(s.split())      # Encode as dumb ASCII (TODO: this is horrible) -    return s.encode('ascii', 'replace').replace(b'?', b'') +    return s.encode("ascii", "replace").replace(b"?", b"") +  def check_authors(left, right):      """ @@ -51,7 +53,7 @@ def check_authors(left, right):          return False      right_all = tokenize(" ".join(right))      for i in range(len(left)): -        l = left[i].lower().replace('jr.', '').split() +        l = left[i].lower().replace("jr.", "").split()          if not l:              return False          l = tokenize(l[-1]) @@ -59,20 +61,22 @@ def check_authors(left, right):              # weird author name (single char)              return False          if l not in right_all: -            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) +            # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))              return False      return True +  def 
test_check_authors():      assert check_authors([], []) == bool(not REQUIRE_AUTHORS) -    assert not check_authors([], ['one']) -    assert check_authors(['one'], ['one']) -    assert check_authors(['one two'], ['One Two']) -    assert check_authors(['two'], ['One Two']) -    assert check_authors(['two'], ['two, one']) -    assert check_authors(['mago'], ['Mr. Magoo']) -    assert check_authors(['Mr. Magoo'], ['Mr Magoo']) -    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) +    assert not check_authors([], ["one"]) +    assert check_authors(["one"], ["one"]) +    assert check_authors(["one two"], ["One Two"]) +    assert check_authors(["two"], ["One Two"]) +    assert check_authors(["two"], ["two, one"]) +    assert check_authors(["mago"], ["Mr. Magoo"]) +    assert check_authors(["Mr. Magoo"], ["Mr Magoo"]) +    assert check_authors(["one", "tw", "thr"], ["one", "two", "three"]) +  # Rows are (score, left, right)  def process_group(rows): @@ -86,10 +90,10 @@ def process_group(rows):          left = json.loads(row[1])          right = json.loads(row[2])          # authors must roughly match -        if not check_authors(left['authors'], right['authors']): +        if not check_authors(left["authors"], right["authors"]):              continue          # years must match (if defined) -        if left['year'] and right['year'] and left['year'] != right['year']: +        if left["year"] and right["year"] and left["year"] != right["year"]:              continue          filtered.append((left, right)) @@ -101,8 +105,8 @@ def process_group(rows):      group_ids = set()      for row in filtered[1:]:          (left, right) = row -        l_id = left['fatcat_release'] -        r_id = right['fatcat_release'] +        l_id = left["fatcat_release"] +        r_id = right["fatcat_release"]          releases[l_id] = left          releases[r_id] = right          if not group_ids: @@ -119,6 +123,7 @@ def process_group(rows):      print(json.dumps([releases[ident] for ident in group_ids])) +  def run():      last_slug = None @@ -126,7 +131,7 @@ def run():      # group lines by slug, and process in batches      for line in sys.stdin: -        line = line.strip().split('\t') +        line = line.strip().split("\t")          assert len(line) == 4          slug = line[0]          if last_slug and slug != last_slug and lines: @@ -140,5 +145,6 @@ def run():      if lines:          process_group(lines) -if __name__=='__main__': + +if __name__ == "__main__":      run() diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index 3654b87..c5b7eef 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).  
No dependencies (only python3 stdlib)  """ -import sys  import json +import sys  # out of 1000  score_threshold = 900 @@ -23,15 +23,16 @@ require_authors = 1  def tokenize(s, remove_whitespace=False): -    s.replace(''', "'") +    s.replace("'", "'")      # Remove non-alphanumeric characters -    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) +    s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])      if remove_whitespace: -        s = ''.join(s.split()) +        s = "".join(s.split())      # Encode as dumb ASCII (TODO: this is horrible) -    return s.encode('ascii', 'replace').replace(b'?', b'') +    return s.encode("ascii", "replace").replace(b"?", b"") +  def check_authors(left, right):      """ @@ -44,7 +45,7 @@ def check_authors(left, right):          return False      right_all = tokenize(" ".join(right))      for i in range(len(left)): -        l = left[i].lower().replace('jr.', '').split() +        l = left[i].lower().replace("jr.", "").split()          if not l:              return False          l = tokenize(l[-1]) @@ -52,20 +53,22 @@ def check_authors(left, right):              # weird author name (single char)              return False          if l not in right_all: -            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) +            # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))              return False      return True +  def test_check_authors():      assert not check_authors([], []) -    assert not check_authors([], ['one']) -    assert check_authors(['one'], ['one']) -    assert check_authors(['one two'], ['One Two']) -    assert check_authors(['two'], ['One Two']) -    assert check_authors(['two'], ['two, one']) -    assert check_authors(['mago'], ['Mr. Magoo']) -    assert check_authors(['Mr. Magoo'], ['Mr Magoo']) -    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) +    assert not check_authors([], ["one"]) +    assert check_authors(["one"], ["one"]) +    assert check_authors(["one two"], ["One Two"]) +    assert check_authors(["two"], ["One Two"]) +    assert check_authors(["two"], ["two, one"]) +    assert check_authors(["mago"], ["Mr. Magoo"]) +    assert check_authors(["Mr. 
Magoo"], ["Mr Magoo"]) +    assert check_authors(["one", "tw", "thr"], ["one", "two", "three"]) +  # Rows are (score, grobid, crossref)  def process_group(rows): @@ -78,20 +81,21 @@ def process_group(rows):              continue          grobid = json.loads(row[1])          crossref = json.loads(row[2]) -        if not check_authors(crossref['authors'], grobid['authors']): -            #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) +        if not check_authors(crossref["authors"], grobid["authors"]): +            # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))              continue          else: -            #print("YES: {} {}".format(crossref['authors'], grobid['authors'])) +            # print("YES: {} {}".format(crossref['authors'], grobid['authors']))              pass -        sha1 = grobid['sha1'] -        doi = crossref['doi'].lower() +        sha1 = grobid["sha1"] +        doi = crossref["doi"].lower()          l = keepers.get(sha1, list())          l.append(doi)          keepers[sha1] = l      for sha1, doi_list in keepers.items():          print("{}\t{}".format(sha1, json.dumps(doi_list))) +  def run():      last_slug = None @@ -99,7 +103,7 @@ def run():      # group lines by slug, and process in batches      for line in sys.stdin: -        line = line.strip().split('\t') +        line = line.strip().split("\t")          assert len(line) == 4          slug = line[0]          if last_slug and slug != last_slug and lines: @@ -112,5 +116,6 @@ def run():      if lines:          process_group(lines) -if __name__=='__main__': + +if __name__ == "__main__":      run() diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index 79feac1..90a0f77 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -1,5 +1,4 @@  #!/usr/bin/env python3 -  """  Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction  output, converts the XML to JSON, filters out raw affiliation strings, and @@ -10,43 +9,49 @@ Run in bulk like:      ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'  """ -import sys  import json +import sys + +from grobid_tei_xml import parse_document_xml -from grobid2json import teixml2json  def parse_hbase(line): -    line = line.split('\t') +    line = line.split("\t")      assert len(line) == 2      sha1hex = line[0]      obj = json.loads(line[1]) -    tei_xml = obj['tei_xml'] +    tei_xml = obj["tei_xml"]      return sha1hex, tei_xml +  def parse_pg(line):      obj = json.loads(line) -    return obj['sha1hex'], obj['tei_xml'] +    return obj["sha1hex"], obj["tei_xml"] + -def run(mode='hbase'): +def run(mode="hbase"):      for line in sys.stdin: -        if mode == 'hbase': +        if mode == "hbase":              sha1hex, tei_xml = parse_hbase(line) -        elif mode == 'pg': +        elif mode == "pg":              sha1hex, tei_xml = parse_pg(line)          else: -            raise NotImplementedError('parse mode: {}'.format(mode)) +            raise NotImplementedError("parse mode: {}".format(mode)) -        obj = teixml2json(tei_xml, encumbered=False) +        tei_doc = parse_document_xml(tei_xml) +        tei_doc.remove_encumbered() +        obj = tei_doc.to_legacy_dict()          affiliations = [] -        for author in obj['authors']: -            if author.get('affiliation'): -                
affiliations.append(author['affiliation']) +        for author in obj["authors"]: +            if author.get("affiliation"): +                affiliations.append(author["affiliation"])          if affiliations:              # don't duplicate affiliations; only the unique ones              affiliations = list(set([json.dumps(a) for a in affiliations]))              affiliations = [json.loads(a) for a in affiliations] -            print('\t'.join([sha1hex, json.dumps(affiliations)])) +            print("\t".join([sha1hex, json.dumps(affiliations)])) + -if __name__=='__main__': +if __name__ == "__main__":      run() diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py index 3d2e14c..f941881 100755 --- a/python/scripts/import_grobid_metadata.py +++ b/python/scripts/import_grobid_metadata.py @@ -1,69 +1,67 @@  #!/usr/bin/env python3 -import sys -import json  import datetime +import json +import sys + +MAX_ABSTRACT_BYTES = 4096 -MAX_ABSTRACT_BYTES=4096  def parse_grobid_json(obj): -    if not obj.get('title'): +    if not obj.get("title"):          return None      extra = dict() -    if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: -        abobj = dict( -            mimetype="text/plain", -            language=None, -            content=obj.get('abstract').strip()) +    if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES: +        abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())          abstracts = [abobj]      else:          abstracts = None      contribs = [] -    for a in obj.get('authors', []): +    for a in obj.get("authors", []):          c = dict(raw_name=a, role="author")          contribs.append(c)      refs = [] -    for raw in obj.get('citations', []): +    for raw in obj.get("citations", []):          extra = dict()          ref = dict() -        ref['key'] = raw.get('id') -        if raw.get('title'): -            ref['title'] = raw['title'].strip() -        if raw.get('date'): +        ref["key"] = raw.get("id") +        if raw.get("title"): +            ref["title"] = raw["title"].strip() +        if raw.get("date"):              try: -                year = int(raw['date'].strip()[:4]) -                ref['year'] = year +                year = int(raw["date"].strip()[:4]) +                ref["year"] = year              except:                  pass -        for key in ('volume', 'url', 'issue', 'publisher'): +        for key in ("volume", "url", "issue", "publisher"):              if raw.get(key):                  extra[key] = raw[key].strip() -        if raw.get('authors'): -            extra['authors'] = [a['name'] for a in raw['authors']] +        if raw.get("authors"): +            extra["authors"] = [a["name"] for a in raw["authors"]]          if extra:              extra = dict(grobid=extra)          else:              extra = None -        ref['extra'] = extra +        ref["extra"] = extra          refs.append(ref)      release_type = "journal-article"      release_date = None -    if obj.get('date'): +    if obj.get("date"):          # TODO: only returns year, ever? how to handle? 
-        release_date = datetime.datetime(year=obj['date'], month=1, day=1) +        release_date = datetime.datetime(year=obj["date"], month=1, day=1) -    if obj.get('doi'): -        extra['doi'] = obj['doi'] -    if obj['journal'].get('name'): -        extra['container_name'] = obj['journal']['name'] +    if obj.get("doi"): +        extra["doi"] = obj["doi"].lower() +    if obj["journal"].get("name"): +        extra["container_name"] = obj["journal"]["name"] -    extra['is_longtail_oa'] = True +    extra["is_longtail_oa"] = True      # TODO: ISSN/eISSN handling? or just journal name lookup? @@ -73,15 +71,17 @@ def parse_grobid_json(obj):          extra = None      return dict( -        title=obj['title'].strip(), +        title=obj["title"].strip(),          contribs=contribs, -        publisher=obj['journal'].get('publisher'), -        volume=obj['journal'].get('volume'), -        issue=obj['journal'].get('issue'), +        publisher=obj["journal"].get("publisher"), +        volume=obj["journal"].get("volume"), +        issue=obj["journal"].get("issue"),          abstracts=abstracts,          release_type=release_type,          release_date=release_date, -        extra=extra) +        extra=extra, +    ) +  def run():      for line in sys.stdin: @@ -90,5 +90,6 @@ def run():          if out:              print(out) -if __name__=="__main__": + +if __name__ == "__main__":      run() diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index 494ec7a..8a353ca 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -1,5 +1,4 @@  #!/usr/bin/env python3 -  """  This script is used to turn ingest request postgres rows (in JSON export  format) back in to regular ingest request JSON. @@ -7,24 +6,25 @@ format) back in to regular ingest request JSON.  The only difference is the name and location of some optional keys.  
""" -import sys -import json  import argparse +import json +import sys  def transform(row):      """      dict-to-dict      """ -    row.pop('created', None) -    extra = row.pop('request', None) or {} -    for k in ('ext_ids', 'edit_extra'): +    row.pop("created", None) +    extra = row.pop("request", None) or {} +    for k in ("ext_ids", "edit_extra"):          if k in extra:              row[k] = extra[k] -    if 'release_ident' in extra: -        row['fatcat'] = dict(release_ident=extra['release_ident']) +    if "release_ident" in extra: +        row["fatcat"] = dict(release_ident=extra["release_ident"])      return row +  def run(args):      for l in args.json_file:          if not l.strip(): @@ -33,19 +33,27 @@ def run(args):              req = transform(json.loads(l))          except:              print(l, file=sys.stderr) +        if args.force_recrawl: +            req["force_recrawl"] = True          print(json.dumps(req, sort_keys=True)) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('json_file', -        help="arabesque output file to use", -        type=argparse.FileType('r')) +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file", help="SQL output JSON file to process", type=argparse.FileType("r") +    ) +    parser.add_argument( +        "--force-recrawl", +        action="store_true", +        help="whether to add recrawl (SPNv2) flag to request", +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py index 35cee5b..24e22fd 100755 --- a/python/scripts/manifest_converter.py +++ b/python/scripts/manifest_converter.py @@ -10,9 +10,9 @@ This was used to convert this manifest:  to JSON format for fast fatcat importing.  """ -import sys  import json  import sqlite3 +import sys  # iterate over rows in files metadata...  # 1. select all identified DOIs @@ -20,6 +20,7 @@ import sqlite3  # 2. select all file metadata  # 3. output object +  def or_none(s):      if s is None:          return None @@ -27,6 +28,7 @@ def or_none(s):          return None      return s +  def process_db(db_path):      db = sqlite3.connect(db_path) @@ -52,5 +54,6 @@ def process_db(db_path):          dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])          print(json.dumps(obj)) -if __name__=="__main__": + +if __name__ == "__main__":      process_db(sys.argv[1]) diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py index 916f41c..97c38f9 100755 --- a/python/scripts/oai2ingestrequest.py +++ b/python/scripts/oai2ingestrequest.py @@ -1,19 +1,18 @@  #!/usr/bin/env python3 -  """  Transform an OAI-PMH bulk dump (JSON) into ingest requests.  
Eg: https://archive.org/details/oai_harvest_20200215  """ -import sys -import json  import argparse +import json +import sys +  import urlcanon  DOMAIN_BLOCKLIST = [      # large OA publishers (we get via DOI) -      # large repos and aggregators (we crawl directly)      "://arxiv.org/",      "://europepmc.org/", @@ -26,23 +25,54 @@ DOMAIN_BLOCKLIST = [      "://archive.org/",      ".archive.org/",      "://127.0.0.1/", - +    "://www.kb.dk/", +    "://kb-images.kb.dk/", +    "://mdz-nbn-resolving.de/", +    "://aggr.ukm.um.si/", +    "://edoc.mpg.de/", +    "doaj.org/", +    "orcid.org/", +    "://gateway.isiknowledge.com/",      # OAI specific additions      "://hdl.handle.net/",  ] +# OAI identifier prefixes for repositories that we want to skip (for various reasons) +OAI_BLOCKLIST = [ +    "oai:kb.dk:", +    "oai:bdr.oai.bsb-muenchen.de:", +    "oai:hispana.mcu.es:", +    "oai:bnf.fr:", +    "oai:ukm.si:", +    "oai:biodiversitylibrary.org:", +    "oai:hsp.org:", +    "oai:repec:", +    "oai:n/a:", +    "oai:quod.lib.umich.edu:", +    "oai:americanae.aecid.es:", +    "oai:www.irgrid.ac.cn:", +    "oai:espace.library.uq.edu:", +    "oai:edoc.mpg.de:", +    "oai:bibliotecadigital.jcyl.es:", +    "oai:repository.erciyes.edu.tr:", +    "oai:krm.or.kr:", +    "oai:hypotheses.org:%", +] +  RELEASE_STAGE_MAP = { -    'info:eu-repo/semantics/draftVersion':     'draft', -    'info:eu-repo/semantics/submittedVersion': 'submitted', -    'info:eu-repo/semantics/acceptedVersion':  'accepted', -    'info:eu-repo/semantics/publishedVersion': 'published', -    'info:eu-repo/semantics/updatedVersion':   'updated', +    "info:eu-repo/semantics/draftVersion": "draft", +    "info:eu-repo/semantics/submittedVersion": "submitted", +    "info:eu-repo/semantics/acceptedVersion": "accepted", +    "info:eu-repo/semantics/publishedVersion": "published", +    "info:eu-repo/semantics/updatedVersion": "updated",  } +  def canon(s):      parsed = urlcanon.parse_url(s)      return str(urlcanon.whatwg(parsed)) +  def transform(obj):      """      Transforms from a single OAI-PMH object to zero or more ingest requests. @@ -50,38 +80,43 @@ def transform(obj):      """      requests = [] -    if not obj.get('oai') or not obj['oai'].startswith('oai:'): +    if not obj.get("oai") or not obj["oai"].startswith("oai:"):          return [] -    if not obj.get('urls'): +    if not obj.get("urls"):          return [] +    oai_id = obj["oai"].lower() +    for prefix in OAI_BLOCKLIST: +        if oai_id.startswith(prefix): +            return [] +      # look in obj['formats'] for PDF? -    if obj.get('formats'): +    if obj.get("formats"):          # if there is a list of formats, and it does not contain PDF, then          # skip. Note that we will continue if there is no formats list.          
has_pdf = False -        for f in obj['formats']: -            if 'pdf' in f.lower(): +        for f in obj["formats"]: +            if "pdf" in f.lower():                  has_pdf = True          if not has_pdf:              return []      doi = None -    if obj.get('doi'): -        doi = obj['doi'][0].lower().strip() -        if not doi.startswith('10.'): +    if obj.get("doi"): +        doi = obj["doi"][0].lower().strip() +        if not doi.startswith("10."):              doi = None      # infer release stage and/or type from obj['types']      release_stage = None -    for t in obj.get('types', []): +    for t in obj.get("types", []):          if t in RELEASE_STAGE_MAP:              release_stage = RELEASE_STAGE_MAP[t]      # TODO: infer rel somehow? Eg, repository vs. OJS publisher      rel = None -    for url in obj['urls']: +    for url in obj["urls"]:          skip = False          for domain in DOMAIN_BLOCKLIST:              if domain in url: @@ -94,23 +129,25 @@ def transform(obj):              continue          request = { -            'base_url': base_url, -            'ingest_type': 'pdf', -            'link_source': 'oai', -            'link_source_id': obj['oai'].lower(), -            'ingest_request_source': 'metha-bulk', -            'release_stage': release_stage, -            'rel': rel, -            'ext_ids': { -                'doi': doi, -                'oai': obj['oai'].lower(), +            "base_url": base_url, +            "ingest_type": "pdf", +            "link_source": "oai", +            "link_source_id": oai_id, +            "ingest_request_source": "metha-bulk", +            "release_stage": release_stage, +            "rel": rel, +            "ext_ids": { +                "oai": obj["oai"].lower(),              }, -            'edit_extra': {}, +            "edit_extra": {},          } +        if doi: +            request["ext_ids"]["doi"] = doi          requests.append(request)      return requests +  def run(args):      for l in args.json_file:          if not l.strip(): @@ -121,17 +158,20 @@ def run(args):          for r in requests:              print("{}".format(json.dumps(r, sort_keys=True))) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('json_file', +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file",          help="OAI-PMH dump file to use (usually stdin)", -        type=argparse.FileType('r')) +        type=argparse.FileType("r"), +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py index e093dc3..8b57c5b 100755 --- a/python/scripts/pdf_thumbnail.py +++ b/python/scripts/pdf_thumbnail.py @@ -1,5 +1,4 @@  #!/usr/bin/env python3 -  """  Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc). @@ -7,6 +6,7 @@ Originally used to benchmark and compare file size/quality.  
"""  import sys +  import poppler  from PIL import Image @@ -22,13 +22,16 @@ def run(inpath, outpath):      renderer = poppler.PageRenderer()      full_page = renderer.render_page(page) -    img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "RGBA", 0, 1) -    img.thumbnail((180,300), Image.BICUBIC) -    #img.thumbnail((360,600), Image.BICUBIC) +    img = Image.frombuffer( +        "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1 +    ) +    img.thumbnail((180, 300), Image.BICUBIC) +    # img.thumbnail((360,600), Image.BICUBIC)      img.save(outpath) -    #img.save(outpath, quality=95) +    # img.save(outpath, quality=95) + -if __name__ == '__main__': +if __name__ == "__main__":      if len(sys.argv) != 3:          print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)          sys.exit(-1) diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index 5536e6c..cb64a1a 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -1,41 +1,39 @@  #!/usr/bin/env python3 -  """  Transform an unpaywall dump (JSON) into ingest requests.  """ -import sys -import json  import argparse +import json +import sys +  import urlcanon  DOMAIN_BLOCKLIST = [      # large OA publishers (we get via DOI) -      # large repos and aggregators (we crawl directly)      "://arxiv.org/",      "://europepmc.org/",      "ncbi.nlm.nih.gov/", -    "semanticscholar.org/",      "://doi.org/",      "zenodo.org/",      "figshare.com/", -    "://archive.org/", -    ".archive.org/",  ]  RELEASE_STAGE_MAP = { -    'draftVersion':     'draft', -    'submittedVersion': 'submitted', -    'acceptedVersion':  'accepted', -    'publishedVersion': 'published', -    'updatedVersion':   'updated', +    "draftVersion": "draft", +    "submittedVersion": "submitted", +    "acceptedVersion": "accepted", +    "publishedVersion": "published", +    "updatedVersion": "updated",  } +  def canon(s):      parsed = urlcanon.parse_url(s)      return str(urlcanon.whatwg(parsed)) +  def transform(obj):      """      Transforms from a single unpaywall object to zero or more ingest requests. 
@@ -43,48 +41,49 @@ def transform(obj):      """      requests = [] -    if not obj['doi'].startswith('10.'): +    if not obj["doi"].startswith("10."):          return requests -    if not obj['oa_locations']: +    if not obj["oa_locations"]:          return requests -    for location in obj['oa_locations']: -        if not location['url_for_pdf']: +    for location in obj["oa_locations"]: +        if not location["url_for_pdf"]:              continue          skip = False          for domain in DOMAIN_BLOCKLIST: -            if domain in location['url_for_pdf']: +            if domain in location["url_for_pdf"]:                  skip = True          if skip:              continue          try: -            base_url = canon(location['url_for_pdf']) +            base_url = canon(location["url_for_pdf"])          except UnicodeEncodeError:              continue          request = { -            'base_url': base_url, -            'ingest_type': 'pdf', -            'link_source': 'unpaywall', -            'link_source_id': obj['doi'].lower(), -            'ingest_request_source': 'unpaywall', -            'release_stage': RELEASE_STAGE_MAP.get(location['version']), -            'rel': location['host_type'], -            'ext_ids': { -                'doi': obj['doi'].lower(), +            "base_url": base_url, +            "ingest_type": "pdf", +            "link_source": "unpaywall", +            "link_source_id": obj["doi"].lower(), +            "ingest_request_source": "unpaywall", +            "release_stage": RELEASE_STAGE_MAP.get(location["version"]), +            "rel": location["host_type"], +            "ext_ids": { +                "doi": obj["doi"].lower(),              }, -            'edit_extra': {}, +            "edit_extra": {},          } -        if obj.get('oa_status'): -            request['edit_extra']['oa_status'] = obj['oa_status'] -        if location.get('evidence'): -            request['edit_extra']['evidence'] = location['evidence'] -        if location['pmh_id']: -            request['ext_ids']['pmh_id'] = location['pmh_id'] +        if obj.get("oa_status"): +            request["edit_extra"]["oa_status"] = obj["oa_status"] +        if location.get("evidence"): +            request["edit_extra"]["evidence"] = location["evidence"] +        if location["pmh_id"]: +            request["ext_ids"]["pmh_id"] = location["pmh_id"]          requests.append(request)      return requests +  def run(args):      for l in args.json_file:          if not l.strip(): @@ -95,17 +94,18 @@ def run(args):          for r in requests:              print("{}".format(json.dumps(r, sort_keys=True))) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('json_file', -        help="unpaywall dump file to use", -        type=argparse.FileType('r')) +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file", help="unpaywall dump file to use", type=argparse.FileType("r") +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() | 
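The grobid_affiliations.py hunk above swaps the old grobid2json.teixml2json() helper for the grobid_tei_xml package. A minimal sketch of the new call pattern, using only the names that appear in the diff (parse_document_xml, remove_encumbered, to_legacy_dict); the extract_affiliations wrapper itself is illustrative and not part of the script:

import json

from grobid_tei_xml import parse_document_xml


def extract_affiliations(tei_xml: str) -> list:
    # parse the GROBID TEI-XML, strip potentially copyright-encumbered
    # fields, and convert to the dict shape the old teixml2json returned
    tei_doc = parse_document_xml(tei_xml)
    tei_doc.remove_encumbered()
    legacy = tei_doc.to_legacy_dict()
    affiliations = [
        author["affiliation"]
        for author in legacy.get("authors", [])
        if author.get("affiliation")
    ]
    # de-duplicate affiliation dicts by round-tripping through JSON,
    # the same trick the script uses
    unique = set(json.dumps(a, sort_keys=True) for a in affiliations)
    return [json.loads(a) for a in unique]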
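The oai2ingestrequest.py change above also reworks ext_ids so the DOI is attached only when the OAI record actually carried one. A sketch of the emitted request shape, with hypothetical identifier and URL values; the key names are taken directly from the diff:

# hypothetical values; key names match the request dict built in the diff
request = {
    "base_url": "https://repo.example.org/record/123/paper.pdf",
    "ingest_type": "pdf",
    "link_source": "oai",
    "link_source_id": "oai:repo.example.org:123",
    "ingest_request_source": "metha-bulk",
    "release_stage": None,
    "rel": None,
    "ext_ids": {"oai": "oai:repo.example.org:123"},
    "edit_extra": {},
}

doi = None  # e.g. "10.1234/example" when the record included a valid DOI
if doi:
    request["ext_ids"]["doi"] = doi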

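The pdf_thumbnail.py hunk fixes the raw decoder mode passed to PIL: python-poppler appears to hand back BGRA-ordered pixel data, so the buffer is decoded as "BGRA" while the target image mode stays "RGBA". A self-contained sketch of the corrected rendering path, assuming the python-poppler document/page API (load_from_file, create_page) used earlier in that script; paths and thumbnail size are illustrative:

import sys

import poppler
from PIL import Image


def render_thumbnail(inpath: str, outpath: str) -> None:
    page = poppler.load_from_file(inpath).create_page(0)
    full_page = poppler.PageRenderer().render_page(page)
    # decode the BGRA pixel buffer from poppler into an RGBA PIL image
    img = Image.frombuffer(
        "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1
    )
    img.thumbnail((180, 300), Image.BICUBIC)
    img.save(outpath)


if __name__ == "__main__":
    render_thumbnail(sys.argv[1], sys.argv[2])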