From 716483103dd7fdfe7aab2982c51abae6d3f4271b Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Wed, 25 Sep 2019 18:01:29 -0700
Subject: move a bunch of random old scripts to subdir

---
 python/deliver_dumpgrobid_to_s3.py         | 124 -------------------
 python/deliver_gwb_to_disk.py              | 166 --------------------------
 python/deliver_gwb_to_s3.py                | 184 -----------------------------
 python/enrich_scored_matches.py            |  45 -------
 python/filter_grobid_metadata.py           | 159 -------------------------
 python/filter_groupworks.py                | 144 ----------------------
 python/filter_scored_matches.py            | 116 ------------------
 python/import_grobid_metadata.py           |  94 ---------------
 python/manifest_converter.py               |  56 ---------
 python/scripts/deliver_dumpgrobid_to_s3.py | 124 +++++++++++++++++++
 python/scripts/deliver_gwb_to_disk.py      | 166 ++++++++++++++++++++++++++
 python/scripts/deliver_gwb_to_s3.py        | 184 +++++++++++++++++++++++++++++
 python/scripts/enrich_scored_matches.py    |  45 +++++++
 python/scripts/filter_grobid_metadata.py   | 159 +++++++++++++++++++++++++
 python/scripts/filter_groupworks.py        | 144 ++++++++++++++++++++++
 python/scripts/filter_scored_matches.py    | 116 ++++++++++++++++++
 python/scripts/import_grobid_metadata.py   |  94 +++++++++++++++
 python/scripts/manifest_converter.py       |  56 +++++++++
 18 files changed, 1088 insertions(+), 1088 deletions(-)
 delete mode 100755 python/deliver_dumpgrobid_to_s3.py
 delete mode 100755 python/deliver_gwb_to_disk.py
 delete mode 100755 python/deliver_gwb_to_s3.py
 delete mode 100755 python/enrich_scored_matches.py
 delete mode 100755 python/filter_grobid_metadata.py
 delete mode 100755 python/filter_groupworks.py
 delete mode 100755 python/filter_scored_matches.py
 delete mode 100755 python/import_grobid_metadata.py
 delete mode 100755 python/manifest_converter.py
 create mode 100755 python/scripts/deliver_dumpgrobid_to_s3.py
 create mode 100755 python/scripts/deliver_gwb_to_disk.py
 create mode 100755 python/scripts/deliver_gwb_to_s3.py
 create mode 100755 python/scripts/enrich_scored_matches.py
 create mode 100755 python/scripts/filter_grobid_metadata.py
 create mode 100755 python/scripts/filter_groupworks.py
 create mode 100755 python/scripts/filter_scored_matches.py
 create mode 100755 python/scripts/import_grobid_metadata.py
 create mode 100755 python/scripts/manifest_converter.py

diff --git a/python/deliver_dumpgrobid_to_s3.py b/python/deliver_dumpgrobid_to_s3.py
deleted file mode 100755
index ac0949c..0000000
--- a/python/deliver_dumpgrobid_to_s3.py
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/env python3
-"""
-Tool for bulk uploading GROBID TEI-XML output from a local filesystem dump
-(from HBase) to AWS S3.
-
-See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
-this script for that specific use-case.
-
-Script takes:
-- input TSV: `sha1_hex, json (including grobid0:tei_xml)`
-    => usually from dumpgrobid, with SHA-1 key transformed to hex, and filtered
-       down (eg, by join by SHA-1) to a specific manifest
-- AWS S3 bucket and prefix
-
-AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
-
-Output:
-- errors/stats to stderr
-- log to stdout (redirect to file), prefixed by sha1
-
-Requires:
-- raven (sentry)
-- boto3 (AWS S3 client library)
-"""
-
-import os
-import sys
-import json
-import base64
-import hashlib
-import argparse
-from collections import Counter
-
-import boto3
-import raven
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
-
-
-def b32_hex(s):
-    """copy/pasta from elsewhere"""
-    s = s.strip().split()[0].lower()
-    if s.startswith("sha1:"):
-        s = s[5:]
-    if len(s) != 32:
-        return s
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
-
-class DeliverDumpGrobidS3():
-
-    def __init__(self, s3_bucket, **kwargs):
-        self.rstore = None
-        self.count = Counter()
-        self.s3_bucket = s3_bucket
-        self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
-        self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
-        self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
-        self.s3 = boto3.resource('s3')
-        self.bucket = self.s3.Bucket(self.s3_bucket)
-
-    def run(self, dump_file):
-        sys.stderr.write("Starting...\n")
-        for line in dump_file:
-            line = line.strip().split('\t')
-            if len(line) != 2:
-                self.count['skip-line'] += 1
-                continue
-            sha1_hex, grobid_json = line[0], line[1]
-            if len(sha1_hex) != 40:
-                sha1_hex = b32_hex(sha1_hex)
-            assert len(sha1_hex) == 40
-            grobid = json.loads(grobid_json)
-            tei_xml = grobid.get('tei_xml')
-            if not tei_xml:
-                print("{}\tskip empty".format(sha1_hex))
-                self.count['skip-empty'] += 1
-                continue
-            tei_xml = tei_xml.encode('utf-8')
-            # upload to AWS S3
-            obj = self.bucket.put_object(
-                Key="{}{}/{}{}".format(
-                    self.s3_prefix,
-                    sha1_hex[0:4],
-                    sha1_hex,
-                    self.s3_suffix,
-                    StorageClass=self.s3_storage_class),
-                Body=tei_xml)
-            print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
-            self.count['success-s3'] += 1
-        sys.stderr.write("{}\n".format(self.count))
-
-@sentry_client.capture_exceptions
-def main():
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--s3-bucket',
-                        required=True,
-                        type=str,
-                        help='AWS S3 bucket to upload into')
-    parser.add_argument('--s3-prefix',
-                        type=str,
-                        default="grobid/",
-                        help='key prefix for items created in bucket')
-    parser.add_argument('--s3-suffix',
-                        type=str,
-                        default=".tei.xml",
-                        help='file suffix for created objects')
-    parser.add_argument('--s3-storage-class',
-                        type=str,
-                        default="STANDARD",
-                        help='AWS S3 storage class (redundancy) to use')
-    parser.add_argument('dump_file',
-                        help="TSV/JSON dump file",
-                        default=sys.stdin,
-                        type=argparse.FileType('r'))
-    args = parser.parse_args()
-
-    worker = DeliverDumpGrobidS3(**args.__dict__)
-    worker.run(args.dump_file)
-
-if __name__ == '__main__': # pragma: no cover
-    main()
diff --git a/python/deliver_gwb_to_disk.py b/python/deliver_gwb_to_disk.py
deleted file mode 100755
index 3dcf962..0000000
--- a/python/deliver_gwb_to_disk.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env python3
-"""
-Tool for bulk copying of PDFs (or other files) from GWB to local disk.
-"""
-
-# XXX: some broken MRO thing going on in here due to python3 object wrangling
-# in `wayback` library. Means we can't run pylint.
-# pylint: skip-file
-
-import os
-import sys
-import json
-import base64
-import hashlib
-import argparse
-from collections import Counter
-
-import raven
-import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
-from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
-
-
-class DeliverGwbDisk:
-
-    def __init__(self, disk_dir, **kwargs):
-        self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
-        self.rstore = None
-        self.count = Counter()
-        # /serve/ instead of /download/ doesn't record view count
-        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
-        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
-        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
-        self.disk_dir = disk_dir
-        self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
-        self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
-
-    def fetch_warc_content(self, warc_path, offset, c_size):
-        warc_uri = self.warc_uri_prefix + warc_path
-        if not self.rstore:
-            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
-                webdata_secret=self.petabox_webdata_secret,
-                download_base_url=self.petabox_base_url))
-        try:
-            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
-        except wayback.exception.ResourceUnavailable:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
-        except ValueError as ve:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
-        except EOFError as eofe:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
-        except TypeError as te:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
-        # Note: could consider a generic "except Exception" here, as we get so
-        # many petabox errors. Do want jobs to fail loud and clear when the
-        # whole cluster is down though.
-
-        if gwb_record.get_status()[0] != 200:
-            return None, dict(status="error",
-                reason="archived HTTP response (WARC) was not 200",
-                warc_status=gwb_record.get_status()[0])
-
-        try:
-            raw_content = gwb_record.open_raw_content().read()
-        except IncompleteRead as ire:
-            return None, dict(status="error",
-                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
-        return raw_content, None
-
-    def run(self, manifest_file):
-        sys.stderr.write("Ensuring all 65536 base directories exist...\n")
-        for i in range(256):
-            for j in range(256):
-                fpath = "{}/{}{:02x}/{:02x}".format(
-                        self.disk_dir,
-                        self.disk_prefix,
-                        i,
-                        j)
-                os.makedirs(fpath, exist_ok=True)
-        sys.stderr.write("Starting...\n")
-        for line in manifest_file:
-            self.count['total'] += 1
-            line = line.strip().split('\t')
-            if len(line) != 2:
-                self.count['skip-line'] += 1
-                continue
-            sha1_hex, cdx_json = line[0], line[1]
-            assert len(sha1_hex) == 40
-            file_cdx = json.loads(cdx_json)
-            # If warc is not item/file.(w)arc.gz form, skip it
-            if len(file_cdx['warc'].split('/')) != 2:
-                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
-                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
-                self.count['skip-warc'] += 1
-                continue
-            # fetch from GWB/petabox via HTTP range-request
-            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
-            if blob is None and status:
-                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
-                self.count['err-petabox-fetch'] += 1
-                continue
-            elif not blob:
-                print("{}\tskip-empty-blob".format(sha1_hex))
-                self.count['skip-empty-blob'] += 1
-                continue
-            # verify sha1
-            if sha1_hex != hashlib.sha1(blob).hexdigest():
-                #assert sha1_hex == hashlib.sha1(blob).hexdigest()
-                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
-                print("{}\terror petabox-hash-mismatch".format(sha1_hex))
-                self.count['err-petabox-hash-mismatch'] += 1
-
-            self.count['petabox-ok'] += 1
-            # save to disk
-            fpath = "{}/{}{}/{}/{}{}".format(
-                    self.disk_dir,
-                    self.disk_prefix,
-                    sha1_hex[0:2],
-                    sha1_hex[2:4],
-                    sha1_hex,
-                    self.disk_suffix)
-            with open(fpath, 'wb') as f:
-                f.write(blob)
-            print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
-            self.count['success-disk'] += 1
-        sys.stderr.write("{}\n".format(self.count))
-
-@sentry_client.capture_exceptions
-def main():
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--disk-dir',
-                        required=True,
-                        type=str,
-                        help='local base directory to save into')
-    parser.add_argument('--disk-prefix',
-                        type=str,
-                        default="pdf/",
-                        help='directory prefix for items created in bucket')
-    parser.add_argument('--disk-suffix',
-                        type=str,
-                        default=".pdf",
-                        help='file suffix for created files')
-    parser.add_argument('--warc-uri-prefix',
-                        type=str,
-                        default='https://archive.org/serve/',
-                        help='URI where WARCs can be found')
-    parser.add_argument('manifest_file',
-                        help="TSV/JSON manifest file",
-                        default=sys.stdin,
-                        type=argparse.FileType('r'))
-    args = parser.parse_args()
-
-    worker = DeliverGwbDisk(**args.__dict__)
-    worker.run(args.manifest_file)
-
-if __name__ == '__main__': # pragma: no cover
-    main()
diff --git a/python/deliver_gwb_to_s3.py b/python/deliver_gwb_to_s3.py
deleted file mode 100755
index 39ac000..0000000
--- a/python/deliver_gwb_to_s3.py
+++ /dev/null
@@ -1,184 +0,0 @@
-#!/usr/bin/env python3
-"""
-Tool for bulk copying of PDFs (or other files) from GWB to AWS S3.
-
-See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
-this script for that specific use-case.
-
-Script takes:
-- input TSV: `sha1_hex, file:cdx (json)`
-    => usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest
-- AWS S3 bucket and prefix
-
-AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
-
-GWB credentials from environment variable PETABOX_WEBDATA_SECRET, else looks in /opt/.petabox/.
-
-20x threads on a single machine can process about 340k files in 3 hours; that's
-roughly 6 hours per million per host with 32 threads, or 5k files an hour
-(1.6/second) per thread. Two large machines should be able to upload 10 million
-files in about 30 hours.
-
-Output:
-- errors/stats to stderr
-- log to stdout (redirect to file), prefixed by sha1
-
-Requires:
-- raven (sentry)
-- boto3 (AWS S3 client library)
-- wayback/GWB libraries
-"""
-
-# XXX: some broken MRO thing going on in here due to python3 object wrangling
-# in `wayback` library. Means we can't run pylint.
-# pylint: skip-file
-
-import os
-import sys
-import json
-import base64
-import hashlib
-import argparse
-from collections import Counter
-
-import boto3
-import raven
-import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
-from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
-
-
-class DeliverGwbS3:
-
-    def __init__(self, s3_bucket, **kwargs):
-        self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
-        self.rstore = None
-        self.count = Counter()
-        # /serve/ instead of /download/ doesn't record view count
-        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
-        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
-        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
-        self.s3_bucket = s3_bucket
-        self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
-        self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
-        self.s3 = boto3.resource('s3')
-        self.bucket = self.s3.Bucket(self.s3_bucket)
-
-    def fetch_warc_content(self, warc_path, offset, c_size):
-        warc_uri = self.warc_uri_prefix + warc_path
-        if not self.rstore:
-            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
-                webdata_secret=self.petabox_webdata_secret,
-                download_base_url=self.petabox_base_url))
-        try:
-            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
-        except wayback.exception.ResourceUnavailable:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
-        except ValueError as ve:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
-        except EOFError as eofe:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
-        except TypeError as te:
-            return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
-        # Note: could consider a generic "except Exception" here, as we get so
-        # many petabox errors. Do want jobs to fail loud and clear when the
-        # whole cluster is down though.
-
-        if gwb_record.get_status()[0] != 200:
-            return None, dict(status="error",
-                reason="archived HTTP response (WARC) was not 200",
-                warc_status=gwb_record.get_status()[0])
-
-        try:
-            raw_content = gwb_record.open_raw_content().read()
-        except IncompleteRead as ire:
-            return None, dict(status="error",
-                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
-        return raw_content, None
-
-    def run(self, manifest_file):
-        sys.stderr.write("Starting...\n")
-        for line in manifest_file:
-            self.count['total'] += 1
-            line = line.strip().split('\t')
-            if len(line) != 2:
-                self.count['skip-line'] += 1
-                continue
-            sha1_hex, cdx_json = line[0], line[1]
-            assert len(sha1_hex) == 40
-            file_cdx = json.loads(cdx_json)
-            # If warc is not item/file.(w)arc.gz form, skip it
-            if len(file_cdx['warc'].split('/')) != 2:
-                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
-                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
-                self.count['skip-warc'] += 1
-                continue
-            # fetch from GWB/petabox via HTTP range-request
-            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
-            if blob is None and status:
-                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
-                self.count['err-petabox-fetch'] += 1
-                continue
-            elif not blob:
-                print("{}\tskip-empty-blob".format(sha1_hex))
-                self.count['skip-empty-blob'] += 1
-                continue
-            # verify sha1
-            if sha1_hex != hashlib.sha1(blob).hexdigest():
-                #assert sha1_hex == hashlib.sha1(blob).hexdigest()
-                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
-                print("{}\terror petabox-hash-mismatch".format(sha1_hex))
-                self.count['err-petabox-hash-mismatch'] += 1
-
-            self.count['petabox-ok'] += 1
-            # upload to AWS S3
-            obj = self.bucket.put_object(
-                Key="{}{}/{}{}".format(
-                    self.s3_prefix,
-                    sha1_hex[0:4],
-                    sha1_hex,
-                    self.s3_suffix),
-                Body=blob)
-            print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
-            self.count['success-s3'] += 1
-        sys.stderr.write("{}\n".format(self.count))
-
-@sentry_client.capture_exceptions
-def main():
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--s3-bucket',
-                        required=True,
-                        type=str,
-                        help='AWS S3 bucket to upload into')
-    parser.add_argument('--s3-prefix',
-                        type=str,
-                        default="pdf/",
-                        help='key prefix for items created in bucket')
-    parser.add_argument('--s3-suffix',
-                        type=str,
-                        default=".pdf",
-                        help='file suffix for created objects')
-    parser.add_argument('--warc-uri-prefix',
-                        type=str,
-                        default='https://archive.org/serve/',
-                        help='URI where WARCs can be found')
-    parser.add_argument('manifest_file',
-                        help="TSV/JSON manifest file",
-                        default=sys.stdin,
-                        type=argparse.FileType('r'))
-    args = parser.parse_args()
-
-    worker = DeliverGwbS3(**args.__dict__)
-    worker.run(args.manifest_file)
-
-if __name__ == '__main__': # pragma: no cover
-    main()
diff --git a/python/enrich_scored_matches.py b/python/enrich_scored_matches.py
deleted file mode 100755
index 9fe1499..0000000
--- a/python/enrich_scored_matches.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-"""
-Takes an "joined" TSV input stream:
-
-- sha1
-- dois (JSON list)
-- cdx (JSON object)
-    - url
-    - dt
-    (etc)
-- mimetype
-- size (integer)
-
-And outputs JSON objects that are can be imported into fatcat with the
-"matched" script.
-
-No dependencies (only python3 stdlib)
-"""
-
-import sys
-import json
-import base64
-
-def run():
-    for line in sys.stdin:
-        line = line.split('\t')
-        assert len(line) == 5
-        raw_sha1 = line[0].replace('sha1:', '')
-        dois = json.loads(line[1])
-        cdx = json.loads(line[2])
-        mimetype = line[3]
-        size = int(line[4])
-
-        sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
-
-        obj = dict(
-            sha1=sha1,
-            dois=dois,
-            cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
-            size=size,
-            mimetype=mimetype)
-        print(json.dumps(obj))
-
-if __name__=='__main__':
-    run()
diff --git a/python/filter_grobid_metadata.py b/python/filter_grobid_metadata.py
deleted file mode 100755
index c33ab86..0000000
--- a/python/filter_grobid_metadata.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-
-with open('title_slug_blacklist.txt', 'r') as f:
-    TITLE_BLACKLIST = [l.strip() for l in f]
-
-TITLE_BLACKLIST.extend((
-    'editorial',
-    'advertisement',
-    'bookreviews',
-    'reviews',
-    'nr',
-    'abstractoriginalarticle',
-    'originalarticle',
-    'impactfactor',
-    'articlenumber',
-))
-
-# The full name can't *entirely* be one of these
-NAME_BLACKLIST = (
-    'phd',
-    'phdstudent',
-)
-
-def tokenize(s, remove_whitespace=True):
-
-    s.replace('&apos;', "'")
-    # Remove non-alphanumeric characters
-    s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
-
-    if remove_whitespace:
-        s = ''.join(s.split())
-
-    # Encode as dumb ASCII (TODO: this is horrible)
-    return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
-
-assert tokenize("Impact Factor: 2.114") == "impactfactor"
-assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
-
-def filter_title(title):
-
-    title = title.strip()
-    if len(title) > 500:
-        return None
-    title_slug = tokenize(title, remove_whitespace=True)
-    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
-        return None
-    if title_slug.startswith('nr'):
-        return None
-    if title.lower().replace('.', '').startswith('int j '):
-        return None
-
-    for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
-        if title.startswith(prefix):
-            title.replace(prefix, '')
-
-    if title.startswith("The Journal of "):
-        return None
-
-    if "volume" in title_slug and "issue" in title_slug:
-        return None
-
-    if "downloadedfrom" in title_slug:
-        return None
-
-    if title_slug.startswith("issn"):
-        return None
-
-    # titles with too many or too few words in title
-    title_words = len(title.split())
-    if title_words > 50 or title_words < 2:
-        return None
-
-    # titles with spaces between every letter (more than N such single-char words)
-    if len([True for w in title.split() if len(w) == 1]) > 12:
-        return None
-
-    # too deep subtitling/splitting
-    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
-        return None
-
-    return title
-
-def filter_author_name(name):
-    name = name['name']
-    if name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
-        return None
-    return ' '.join([t for t in name.split() if tokenize(t)])
-
-def filter_authors(l):
-    return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
-
-def filter_refs(l):
-    # TODO:
-    return l
-
-def filter_journal_name(name):
-    # same blacklist, for now
-    if not name:
-        return None
-    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
-    slug_name = tokenize(name)
-    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
-        return None
-    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
-        if name.startswith(prefix):
-            name = name.replace(prefix, '')
-    for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
-        if name.endswith(suffix):
-            name = name.replace(suffix, '')
-    if "====================" in name:
-        return None
-    if len(name) > 150:
-        return None
-    return ' '.join(name.split())
-
-def filter_metadata(obj):
-    if not (obj.get('title') and obj.get('authors')):
-        return None
-
-    title = filter_title(obj['title'])
-    if not title:
-        #sys.stderr.write("bad title\n")
-        return None
-    else:
-        obj['title'] = title
-    obj['authors'] = filter_authors(obj['authors'])
-    obj['citations'] = filter_refs(obj['citations'])
-    obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
-
-    return obj
-
-def run(invert=False):
-    for line in sys.stdin:
-        fields = line.split('\t')
-        if len(fields) == 5:
-            raw = fields[4]
-        elif len(fields) == 1:
-            raw = fields[0]
-        else:
-            sys.stderr.write("bad line\n")
-            continue
-        obj = json.loads(raw)
-        processed = filter_metadata(obj)
-        if processed:
-            if not invert:
-                processed = json.dumps(processed)
-                if len(fields) == 5:
-                    fields[4] = processed
-                else:
-                    fields[0] = processed
-                print('\t'.join(fields))
-        elif invert:
-            print(raw.strip())
-
-if __name__=="__main__":
-    run(invert="--invert" in sys.argv)
diff --git a/python/filter_groupworks.py b/python/filter_groupworks.py
deleted file mode 100755
index bbba770..0000000
--- a/python/filter_groupworks.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-"""
-Filters an input stream of sorted "groupworks" scalding job, and outputs
-"good enough" matches to be merged in fatcat.
-
-Output is JSON lines which are arrays of releases that could/should be merged
-together, either as multiple releases under a single work, or releases merged
-into a single entity (via redirects).
-
-Note that releases *should* only end up on a single line, and only once per
-line!
-
-No dependencies (only python3 stdlib)
-
-Note: the actual importer/merger should filter the following patterns out:
-- container title has "letter" and "diar"
-- contribs (authors) contain "&NA;"
-- dates differ (not just year)
-"""
-
-import sys
-import json
-
-# out of 1000
-SCORE_THRESHOLD = 900
-
-MAX_SLUG_LINES = 50
-
-REQUIRE_AUTHORS = False
-
-def tokenize(s, remove_whitespace=False):
-
-    s.replace('&apos;', "'")
-    # Remove non-alphanumeric characters
-    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
-
-    if remove_whitespace:
-        s = ''.join(s.split())
-
-    # Encode as dumb ASCII (TODO: this is horrible)
-    return s.encode('ascii', 'replace').replace(b'?', b'')
-
-def check_authors(left, right):
-    """
-    Intended to check GROBID extracted authors (right) against "known good"
-    (but maybe not perfect) Crossref metadata authors ("left").
-    """
-    if not left and not right:
-        return bool(not REQUIRE_AUTHORS)
-    if len(left) != len(right):
-        return False
-    right_all = tokenize(" ".join(right))
-    for i in range(len(left)):
-        l = left[i].lower().replace('jr.', '').split()
-        if not l:
-            return False
-        l = tokenize(l[-1])
-        if len(l) <= 1:
-            # weird author name (single char)
-            return False
-        if l not in right_all:
-            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
-            return False
-    return True
-
-def test_check_authors():
-    assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
-    assert not check_authors([], ['one'])
-    assert check_authors(['one'], ['one'])
-    assert check_authors(['one two'], ['One Two'])
-    assert check_authors(['two'], ['One Two'])
-    assert check_authors(['two'], ['two, one'])
-    assert check_authors(['mago'], ['Mr. Magoo'])
-    assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
-    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
-
-# Rows are (score, left, right)
-def process_group(rows):
-
-    # first pass reduces size of list and generates a linkage graph
-    filtered = list()
-    for row in rows:
-        score = int(row[0])
-        if score < SCORE_THRESHOLD:
-            continue
-        left = json.loads(row[1])
-        right = json.loads(row[2])
-        # authors must roughly match
-        if not check_authors(left['authors'], right['authors']):
-            continue
-        # years must match (if defined)
-        if left['year'] and right['year'] and left['year'] != right['year']:
-            continue
-        filtered.append((left, right))
-
-    if not filtered:
-        return
-
-    # second pass finds a connected graph and returns that
-    releases = dict()
-    group_ids = set()
-    for row in filtered[1:]:
-        (left, right) = row
-        l_id = left['fatcat_release']
-        r_id = right['fatcat_release']
-        releases[l_id] = left
-        releases[r_id] = right
-        if not group_ids:
-            group_ids.add(l_id)
-            group_ids.add(r_id)
-            continue
-        if l_id in group_ids or r_id in group_ids:
-            group_ids.add(l_id)
-            group_ids.add(r_id)
-            continue
-
-    if not group_ids:
-        return
-
-    print(json.dumps([releases[ident] for ident in group_ids]))
-
-def run():
-
-    last_slug = None
-    lines = []
-
-    # group lines by slug, and process in batches
-    for line in sys.stdin:
-        line = line.strip().split('\t')
-        assert len(line) == 4
-        slug = line[0]
-        if last_slug and slug != last_slug and lines:
-            if len(lines) <= MAX_SLUG_LINES:
-                process_group(lines)
-            lines = []
-        last_slug = slug
-        lines.append(line[1:])
-
-    # catch any remaining
-    if lines:
-        process_group(lines)
-
-if __name__=='__main__':
-    run()
diff --git a/python/filter_scored_matches.py b/python/filter_scored_matches.py
deleted file mode 100755
index 3654b87..0000000
--- a/python/filter_scored_matches.py
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/usr/bin/env python3
-"""
-Filters an input stream of sorted "matchcrossref" scalding job, and outputs
-"good enough" matches to be inserted to fatcat.
-
-Currently works on DOI numbers. Filters for a high enough string match (doesn't
-re-do title match), and checks author lists. Filters out slugs with too many
-matches, and outputs one-line-per-sha1 (aka, file).
-
-No dependencies (only python3 stdlib)
-"""
-
-import sys
-import json
-
-# out of 1000
-score_threshold = 900
-
-max_slug_lines = 10
-
-require_authors = 1
-
-
-def tokenize(s, remove_whitespace=False):
-
-    s.replace('&apos;', "'")
-    # Remove non-alphanumeric characters
-    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
-
-    if remove_whitespace:
-        s = ''.join(s.split())
-
-    # Encode as dumb ASCII (TODO: this is horrible)
-    return s.encode('ascii', 'replace').replace(b'?', b'')
-
-def check_authors(left, right):
-    """
-    Intended to check GROBID extracted authors (right) against "known good"
-    (but maybe not perfect) Crossref metadata authors ("left").
-    """
-    if not left:
-        return False
-    if len(left) > len(right):
-        return False
-    right_all = tokenize(" ".join(right))
-    for i in range(len(left)):
-        l = left[i].lower().replace('jr.', '').split()
-        if not l:
-            return False
-        l = tokenize(l[-1])
-        if len(l) <= 1:
-            # weird author name (single char)
-            return False
-        if l not in right_all:
-            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
-            return False
-    return True
-
-def test_check_authors():
-    assert not check_authors([], [])
-    assert not check_authors([], ['one'])
-    assert check_authors(['one'], ['one'])
-    assert check_authors(['one two'], ['One Two'])
-    assert check_authors(['two'], ['One Two'])
-    assert check_authors(['two'], ['two, one'])
-    assert check_authors(['mago'], ['Mr. Magoo'])
-    assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
-    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
-
-# Rows are (score, grobid, crossref)
-def process_group(rows):
-    if len(rows) > max_slug_lines:
-        return
-    keepers = dict()
-    for row in rows:
-        score = int(row[0])
-        if score < score_threshold:
-            continue
-        grobid = json.loads(row[1])
-        crossref = json.loads(row[2])
-        if not check_authors(crossref['authors'], grobid['authors']):
-            #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
-            continue
-        else:
-            #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
-            pass
-        sha1 = grobid['sha1']
-        doi = crossref['doi'].lower()
-        l = keepers.get(sha1, list())
-        l.append(doi)
-        keepers[sha1] = l
-    for sha1, doi_list in keepers.items():
-        print("{}\t{}".format(sha1, json.dumps(doi_list)))
-
-def run():
-
-    last_slug = None
-    lines = []
-
-    # group lines by slug, and process in batches
-    for line in sys.stdin:
-        line = line.strip().split('\t')
-        assert len(line) == 4
-        slug = line[0]
-        if last_slug and slug != last_slug and lines:
-            process_group(lines)
-            lines = []
-        last_slug = slug
-        lines.append(line[1:])
-
-    # catch any remaining
-    if lines:
-        process_group(lines)
-
-if __name__=='__main__':
-    run()
diff --git a/python/import_grobid_metadata.py b/python/import_grobid_metadata.py
deleted file mode 100755
index 3d2e14c..0000000
--- a/python/import_grobid_metadata.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-import datetime
-
-MAX_ABSTRACT_BYTES=4096
-
-def parse_grobid_json(obj):
-
-    if not obj.get('title'):
-        return None
-
-    extra = dict()
-
-    if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
-        abobj = dict(
-            mimetype="text/plain",
-            language=None,
-            content=obj.get('abstract').strip())
-        abstracts = [abobj]
-    else:
-        abstracts = None
-
-    contribs = []
-    for a in obj.get('authors', []):
-        c = dict(raw_name=a, role="author")
-        contribs.append(c)
-
-    refs = []
-    for raw in obj.get('citations', []):
-        extra = dict()
-        ref = dict()
-        ref['key'] = raw.get('id')
-        if raw.get('title'):
-            ref['title'] = raw['title'].strip()
-        if raw.get('date'):
-            try:
-                year = int(raw['date'].strip()[:4])
-                ref['year'] = year
-            except:
-                pass
-        for key in ('volume', 'url', 'issue', 'publisher'):
-            if raw.get(key):
-                extra[key] = raw[key].strip()
-        if raw.get('authors'):
-            extra['authors'] = [a['name'] for a in raw['authors']]
-        if extra:
-            extra = dict(grobid=extra)
-        else:
-            extra = None
-        ref['extra'] = extra
-        refs.append(ref)
-
-    release_type = "journal-article"
-    release_date = None
-    if obj.get('date'):
-        # TODO: only returns year, ever? how to handle?
-        release_date = datetime.datetime(year=obj['date'], month=1, day=1)
-
-    if obj.get('doi'):
-        extra['doi'] = obj['doi']
-    if obj['journal'].get('name'):
-        extra['container_name'] = obj['journal']['name']
-
-    extra['is_longtail_oa'] = True
-
-    # TODO: ISSN/eISSN handling? or just journal name lookup?
-
-    if extra:
-        extra = dict(grobid=extra)
-    else:
-        extra = None
-
-    return dict(
-        title=obj['title'].strip(),
-        contribs=contribs,
-        publisher=obj['journal'].get('publisher'),
-        volume=obj['journal'].get('volume'),
-        issue=obj['journal'].get('issue'),
-        abstracts=abstracts,
-        release_type=release_type,
-        release_date=release_date,
-        extra=extra)
-
-def run():
-    for line in sys.stdin:
-        obj = json.loads(line)
-        out = parse_grobid_json(obj)
-        if out:
-            print(out)
-
-if __name__=="__main__":
-    run()
diff --git a/python/manifest_converter.py b/python/manifest_converter.py
deleted file mode 100755
index 35cee5b..0000000
--- a/python/manifest_converter.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env python3
-"""
-Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of
-"match" JSON objects which can be imported into fatcat with matched_import.py
-
-This was used to convert this manifest:
-
-    https://archive.org/details/ia_papers_manifest_2018-01-25/
-
-to JSON format for fast fatcat importing.
-"""
-
-import sys
-import json
-import sqlite3
-
-# iterate over rows in files metadata...
-# 1. select all identified DOIs
-#   => filter based on count
-# 2. select all file metadata
-# 3. output object
-
-def or_none(s):
-    if s is None:
-        return None
-    elif type(s) == str and ((not s) or s == "\\N" or s == "-"):
-        return None
-    return s
-
-def process_db(db_path):
-
-    db = sqlite3.connect(db_path)
-
-    for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"):
-        sha1 = row[0]
-        dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall()
-        dois = [d[0] for d in dois]
-        if not dois:
-            continue
-        urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall()
-        if not urls:
-            continue
-        cdx = [dict(url=row[0], dt=row[1]) for row in urls]
-        obj = dict(
-            sha1=sha1,
-            mimetype=or_none(row[1]),
-            size=(or_none(row[2]) and int(row[2])),
-            md5=or_none(row[3]),
-            dois=dois,
-            cdx=cdx,
-        )
-        dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
-        print(json.dumps(obj))
-
-if __name__=="__main__":
-    process_db(sys.argv[1])
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
new file mode 100755
index 0000000..ac0949c
--- /dev/null
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""
+Tool for bulk uploading GROBID TEI-XML output from a local filesystem dump
+(from HBase) to AWS S3.
+
+See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
+this script for that specific use-case.
+
+Script takes:
+- input TSV: `sha1_hex, json (including grobid0:tei_xml)`
+    => usually from dumpgrobid, with SHA-1 key transformed to hex, and filtered
+       down (eg, by join by SHA-1) to a specific manifest
+- AWS S3 bucket and prefix
+
+AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+
+Output:
+- errors/stats to stderr
+- log to stdout (redirect to file), prefixed by sha1
+
+Requires:
+- raven (sentry)
+- boto3 (AWS S3 client library)
+"""
+
+import os
+import sys
+import json
+import base64
+import hashlib
+import argparse
+from collections import Counter
+
+import boto3
+import raven
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+
+def b32_hex(s):
+    """copy/pasta from elsewhere"""
+    s = s.strip().split()[0].lower()
+    if s.startswith("sha1:"):
+        s = s[5:]
+    if len(s) != 32:
+        return s
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+
+class DeliverDumpGrobidS3():
+
+    def __init__(self, s3_bucket, **kwargs):
+        self.rstore = None
+        self.count = Counter()
+        self.s3_bucket = s3_bucket
+        self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
+        self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
+        self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
+        self.s3 = boto3.resource('s3')
+        self.bucket = self.s3.Bucket(self.s3_bucket)
+
+    def run(self, dump_file):
+        sys.stderr.write("Starting...\n")
+        for line in dump_file:
+            line = line.strip().split('\t')
+            if len(line) != 2:
+                self.count['skip-line'] += 1
+                continue
+            sha1_hex, grobid_json = line[0], line[1]
+            if len(sha1_hex) != 40:
+                sha1_hex = b32_hex(sha1_hex)
+            assert len(sha1_hex) == 40
+            grobid = json.loads(grobid_json)
+            tei_xml = grobid.get('tei_xml')
+            if not tei_xml:
+                print("{}\tskip empty".format(sha1_hex))
+                self.count['skip-empty'] += 1
+                continue
+            tei_xml = tei_xml.encode('utf-8')
+            # upload to AWS S3
+            obj = self.bucket.put_object(
+                Key="{}{}/{}{}".format(
+                    self.s3_prefix,
+                    sha1_hex[0:4],
+                    sha1_hex,
+                    self.s3_suffix,
+                    StorageClass=self.s3_storage_class),
+                Body=tei_xml)
+            print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
+            self.count['success-s3'] += 1
+        sys.stderr.write("{}\n".format(self.count))
+
+@sentry_client.capture_exceptions
+def main():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--s3-bucket',
+                        required=True,
+                        type=str,
+                        help='AWS S3 bucket to upload into')
+    parser.add_argument('--s3-prefix',
+                        type=str,
+                        default="grobid/",
+                        help='key prefix for items created in bucket')
+    parser.add_argument('--s3-suffix',
+                        type=str,
+                        default=".tei.xml",
+                        help='file suffix for created objects')
+    parser.add_argument('--s3-storage-class',
+                        type=str,
+                        default="STANDARD",
+                        help='AWS S3 storage class (redundancy) to use')
+    parser.add_argument('dump_file',
+                        help="TSV/JSON dump file",
+                        default=sys.stdin,
+                        type=argparse.FileType('r'))
+    args = parser.parse_args()
+
+    worker = DeliverDumpGrobidS3(**args.__dict__)
+    worker.run(args.dump_file)
+
+if __name__ == '__main__': # pragma: no cover
+    main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
new file mode 100755
index 0000000..3dcf962
--- /dev/null
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+Tool for bulk copying of PDFs (or other files) from GWB to local disk.
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import sys
+import json
+import base64
+import hashlib
+import argparse
+from collections import Counter
+
+import raven
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+
+class DeliverGwbDisk:
+
+    def __init__(self, disk_dir, **kwargs):
+        self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+        self.rstore = None
+        self.count = Counter()
+        # /serve/ instead of /download/ doesn't record view count
+        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
+        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+        self.disk_dir = disk_dir
+        self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
+        self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
+
+    def fetch_warc_content(self, warc_path, offset, c_size):
+        warc_uri = self.warc_uri_prefix + warc_path
+        if not self.rstore:
+            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+                webdata_secret=self.petabox_webdata_secret,
+                download_base_url=self.petabox_base_url))
+        try:
+            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
+        except wayback.exception.ResourceUnavailable:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+        except ValueError as ve:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+        except EOFError as eofe:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+        except TypeError as te:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+        # Note: could consider a generic "except Exception" here, as we get so
+        # many petabox errors. Do want jobs to fail loud and clear when the
+        # whole cluster is down though.
+
+        if gwb_record.get_status()[0] != 200:
+            return None, dict(status="error",
+                reason="archived HTTP response (WARC) was not 200",
+                warc_status=gwb_record.get_status()[0])
+
+        try:
+            raw_content = gwb_record.open_raw_content().read()
+        except IncompleteRead as ire:
+            return None, dict(status="error",
+                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+        return raw_content, None
+
+    def run(self, manifest_file):
+        sys.stderr.write("Ensuring all 65536 base directories exist...\n")
+        for i in range(256):
+            for j in range(256):
+                fpath = "{}/{}{:02x}/{:02x}".format(
+                        self.disk_dir,
+                        self.disk_prefix,
+                        i,
+                        j)
+                os.makedirs(fpath, exist_ok=True)
+        sys.stderr.write("Starting...\n")
+        for line in manifest_file:
+            self.count['total'] += 1
+            line = line.strip().split('\t')
+            if len(line) != 2:
+                self.count['skip-line'] += 1
+                continue
+            sha1_hex, cdx_json = line[0], line[1]
+            assert len(sha1_hex) == 40
+            file_cdx = json.loads(cdx_json)
+            # If warc is not item/file.(w)arc.gz form, skip it
+            if len(file_cdx['warc'].split('/')) != 2:
+                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
+                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
+                self.count['skip-warc'] += 1
+                continue
+            # fetch from GWB/petabox via HTTP range-request
+            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+            if blob is None and status:
+                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+                self.count['err-petabox-fetch'] += 1
+                continue
+            elif not blob:
+                print("{}\tskip-empty-blob".format(sha1_hex))
+                self.count['skip-empty-blob'] += 1
+                continue
+            # verify sha1
+            if sha1_hex != hashlib.sha1(blob).hexdigest():
+                #assert sha1_hex == hashlib.sha1(blob).hexdigest()
+                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+                print("{}\terror petabox-hash-mismatch".format(sha1_hex))
+                self.count['err-petabox-hash-mismatch'] += 1
+
+            self.count['petabox-ok'] += 1
+            # save to disk
+            fpath = "{}/{}{}/{}/{}{}".format(
+                    self.disk_dir,
+                    self.disk_prefix,
+                    sha1_hex[0:2],
+                    sha1_hex[2:4],
+                    sha1_hex,
+                    self.disk_suffix)
+            with open(fpath, 'wb') as f:
+                f.write(blob)
+            print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
+            self.count['success-disk'] += 1
+        sys.stderr.write("{}\n".format(self.count))
+
+@sentry_client.capture_exceptions
+def main():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--disk-dir',
+                        required=True,
+                        type=str,
+                        help='local base directory to save into')
+    parser.add_argument('--disk-prefix',
+                        type=str,
+                        default="pdf/",
+                        help='directory prefix for items created in bucket')
+    parser.add_argument('--disk-suffix',
+                        type=str,
+                        default=".pdf",
+                        help='file suffix for created files')
+    parser.add_argument('--warc-uri-prefix',
+                        type=str,
+                        default='https://archive.org/serve/',
+                        help='URI where WARCs can be found')
+    parser.add_argument('manifest_file',
+                        help="TSV/JSON manifest file",
+                        default=sys.stdin,
+                        type=argparse.FileType('r'))
+    args = parser.parse_args()
+
+    worker = DeliverGwbDisk(**args.__dict__)
+    worker.run(args.manifest_file)
+
+if __name__ == '__main__': # pragma: no cover
+    main()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
new file mode 100755
index 0000000..39ac000
--- /dev/null
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""
+Tool for bulk copying of PDFs (or other files) from GWB to AWS S3.
+
+See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
+this script for that specific use-case.
+
+Script takes:
+- input TSV: `sha1_hex, file:cdx (json)`
+    => usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest
+- AWS S3 bucket and prefix
+
+AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+
+GWB credentials from environment variable PETABOX_WEBDATA_SECRET, else looks in /opt/.petabox/.
+
+20x threads on a single machine can process about 340k files in 3 hours; that's
+roughly 6 hours per million per host with 32 threads, or 5k files an hour
+(1.6/second) per thread. Two large machines should be able to upload 10 million
+files in about 30 hours.
+
+Output:
+- errors/stats to stderr
+- log to stdout (redirect to file), prefixed by sha1
+
+Requires:
+- raven (sentry)
+- boto3 (AWS S3 client library)
+- wayback/GWB libraries
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import sys
+import json
+import base64
+import hashlib
+import argparse
+from collections import Counter
+
+import boto3
+import raven
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+
+class DeliverGwbS3:
+
+    def __init__(self, s3_bucket, **kwargs):
+        self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+        self.rstore = None
+        self.count = Counter()
+        # /serve/ instead of /download/ doesn't record view count
+        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
+        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+        self.s3_bucket = s3_bucket
+        self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
+        self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
+        self.s3 = boto3.resource('s3')
+        self.bucket = self.s3.Bucket(self.s3_bucket)
+
+    def fetch_warc_content(self, warc_path, offset, c_size):
+        warc_uri = self.warc_uri_prefix + warc_path
+        if not self.rstore:
+            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+                webdata_secret=self.petabox_webdata_secret,
+                download_base_url=self.petabox_base_url))
+        try:
+            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
+        except wayback.exception.ResourceUnavailable:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+        except ValueError as ve:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+        except EOFError as eofe:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+        except TypeError as te:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+        # Note: could consider a generic "except Exception" here, as we get so
+        # many petabox errors. Do want jobs to fail loud and clear when the
+        # whole cluster is down though.
+
+        if gwb_record.get_status()[0] != 200:
+            return None, dict(status="error",
+                reason="archived HTTP response (WARC) was not 200",
+                warc_status=gwb_record.get_status()[0])
+
+        try:
+            raw_content = gwb_record.open_raw_content().read()
+        except IncompleteRead as ire:
+            return None, dict(status="error",
+                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+        return raw_content, None
+
+    def run(self, manifest_file):
+        sys.stderr.write("Starting...\n")
+        for line in manifest_file:
+            self.count['total'] += 1
+            line = line.strip().split('\t')
+            if len(line) != 2:
+                self.count['skip-line'] += 1
+                continue
+            sha1_hex, cdx_json = line[0], line[1]
+            assert len(sha1_hex) == 40
+            file_cdx = json.loads(cdx_json)
+            # If warc is not item/file.(w)arc.gz form, skip it
+            if len(file_cdx['warc'].split('/')) != 2:
+                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
+                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
+                self.count['skip-warc'] += 1
+                continue
+            # fetch from GWB/petabox via HTTP range-request
+            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+            if blob is None and status:
+                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+                self.count['err-petabox-fetch'] += 1
+                continue
+            elif not blob:
+                print("{}\tskip-empty-blob".format(sha1_hex))
+                self.count['skip-empty-blob'] += 1
+                continue
+            # verify sha1
+            if sha1_hex != hashlib.sha1(blob).hexdigest():
+                #assert sha1_hex == hashlib.sha1(blob).hexdigest()
+                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+                print("{}\terror petabox-hash-mismatch".format(sha1_hex))
+                self.count['err-petabox-hash-mismatch'] += 1
+
+            self.count['petabox-ok'] += 1
+            # upload to AWS S3
+            obj = self.bucket.put_object(
+                Key="{}{}/{}{}".format(
+                    self.s3_prefix,
+                    sha1_hex[0:4],
+                    sha1_hex,
+                    self.s3_suffix),
+                Body=blob)
+            print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
+            self.count['success-s3'] += 1
+        sys.stderr.write("{}\n".format(self.count))
+
+@sentry_client.capture_exceptions
+def main():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--s3-bucket',
+                        required=True,
+                        type=str,
+                        help='AWS S3 bucket to upload into')
+    parser.add_argument('--s3-prefix',
+                        type=str,
+                        default="pdf/",
+                        help='key prefix for items created in bucket')
+    parser.add_argument('--s3-suffix',
+                        type=str,
+                        default=".pdf",
+                        help='file suffix for created objects')
+    parser.add_argument('--warc-uri-prefix',
+                        type=str,
+                        default='https://archive.org/serve/',
+                        help='URI where WARCs can be found')
+    parser.add_argument('manifest_file',
+                        help="TSV/JSON manifest file",
+                        default=sys.stdin,
+                        type=argparse.FileType('r'))
+    args = parser.parse_args()
+
+    worker = DeliverGwbS3(**args.__dict__)
+    worker.run(args.manifest_file)
+
+if __name__ == '__main__': # pragma: no cover
+    main()
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
new file mode 100755
index 0000000..9fe1499
--- /dev/null
+++ b/python/scripts/enrich_scored_matches.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""
+Takes an "joined" TSV input stream:
+
+- sha1
+- dois (JSON list)
+- cdx (JSON object)
+    - url
+    - dt
+    (etc)
+- mimetype
+- size (integer)
+
+And outputs JSON objects that are can be imported into fatcat with the
+"matched" script.
+
+No dependencies (only python3 stdlib)
+"""
+
+import sys
+import json
+import base64
+
+def run():
+    for line in sys.stdin:
+        line = line.split('\t')
+        assert len(line) == 5
+        raw_sha1 = line[0].replace('sha1:', '')
+        dois = json.loads(line[1])
+        cdx = json.loads(line[2])
+        mimetype = line[3]
+        size = int(line[4])
+
+        sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
+
+        obj = dict(
+            sha1=sha1,
+            dois=dois,
+            cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+            size=size,
+            mimetype=mimetype)
+        print(json.dumps(obj))
+
+if __name__=='__main__':
+    run()
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
new file mode 100755
index 0000000..c33ab86
--- /dev/null
+++ b/python/scripts/filter_grobid_metadata.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+
+with open('title_slug_blacklist.txt', 'r') as f:
+    TITLE_BLACKLIST = [l.strip() for l in f]
+
+TITLE_BLACKLIST.extend((
+    'editorial',
+    'advertisement',
+    'bookreviews',
+    'reviews',
+    'nr',
+    'abstractoriginalarticle',
+    'originalarticle',
+    'impactfactor',
+    'articlenumber',
+))
+
+# The full name can't *entirely* be one of these
+NAME_BLACKLIST = (
+    'phd',
+    'phdstudent',
+)
+
+def tokenize(s, remove_whitespace=True):
+
+    s.replace('&apos;', "'")
+    # Remove non-alphanumeric characters
+    s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+
+    if remove_whitespace:
+        s = ''.join(s.split())
+
+    # Encode as dumb ASCII (TODO: this is horrible)
+    return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+
+assert tokenize("Impact Factor: 2.114") == "impactfactor"
+assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
+
+def filter_title(title):
+
+    title = title.strip()
+    if len(title) > 500:
+        return None
+    title_slug = tokenize(title, remove_whitespace=True)
+    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
+        return None
+    if title_slug.startswith('nr'):
+        return None
+    if title.lower().replace('.', '').startswith('int j '):
+        return None
+
+    for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
+        if title.startswith(prefix):
+            title.replace(prefix, '')
+
+    if title.startswith("The Journal of "):
+        return None
+
+    if "volume" in title_slug and "issue" in title_slug:
+        return None
+
+    if "downloadedfrom" in title_slug:
+        return None
+
+    if title_slug.startswith("issn"):
+        return None
+
+    # titles with too many or too few words in title
+    title_words = len(title.split())
+    if title_words > 50 or title_words < 2:
+        return None
+
+    # titles with spaces between every letter (more than N such single-char words)
+    if len([True for w in title.split() if len(w) == 1]) > 12:
+        return None
+
+    # too deep subtitling/splitting
+    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+        return None
+
+    return title
+
+def filter_author_name(name):
+    name = name['name']
+    if name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
+        return None
+    return ' '.join([t for t in name.split() if tokenize(t)])
+
+def filter_authors(l):
+    return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
+
+def filter_refs(l):
+    # TODO:
+    return l
+
+def filter_journal_name(name):
+    # same blacklist, for now
+    if not name:
+        return None
+    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+    slug_name = tokenize(name)
+    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
+        return None
+    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+        if name.startswith(prefix):
+            name = name.replace(prefix, '')
+    for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+        if name.endswith(suffix):
+            name = name.replace(suffix, '')
+    if "====================" in name:
+        return None
+    if len(name) > 150:
+        return None
+    return ' '.join(name.split())
+
+def filter_metadata(obj):
+    if not (obj.get('title') and obj.get('authors')):
+        return None
+
+    title = filter_title(obj['title'])
+    if not title:
+        #sys.stderr.write("bad title\n")
+        return None
+    else:
+        obj['title'] = title
+    obj['authors'] = filter_authors(obj['authors'])
+    obj['citations'] = filter_refs(obj['citations'])
+    obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
+
+    return obj
+
+def run(invert=False):
+    for line in sys.stdin:
+        fields = line.split('\t')
+        if len(fields) == 5:
+            raw = fields[4]
+        elif len(fields) == 1:
+            raw = fields[0]
+        else:
+            sys.stderr.write("bad line\n")
+            continue
+        obj = json.loads(raw)
+        processed = filter_metadata(obj)
+        if processed:
+            if not invert:
+                processed = json.dumps(processed)
+                if len(fields) == 5:
+                    fields[4] = processed
+                else:
+                    fields[0] = processed
+                print('\t'.join(fields))
+        elif invert:
+            print(raw.strip())
+
+if __name__=="__main__":
+    run(invert="--invert" in sys.argv)
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
new file mode 100755
index 0000000..bbba770
--- /dev/null
+++ b/python/scripts/filter_groupworks.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Filters an input stream of sorted "groupworks" scalding job, and outputs
+"good enough" matches to be merged in fatcat.
+
+Output is JSON lines which are arrays of releases that could/should be merged
+together, either as multiple releases under a single work, or releases merged
+into a single entity (via redirects).
+
+Note that releases *should* only end up on a single line, and only once per
+line!
+
+No dependencies (only python3 stdlib)
+
+Note: the actual importer/merger should filter the following patterns out:
+- container title has "letter" and "diar"
+- contribs (authors) contain "&NA;"
+- dates differ (not just year)
+"""
+
+import sys
+import json
+
+# out of 1000
+SCORE_THRESHOLD = 900
+
+MAX_SLUG_LINES = 50
+
+REQUIRE_AUTHORS = False
+
+def tokenize(s, remove_whitespace=False):
+
+    s.replace('&apos;', "'")
+    # Remove non-alphanumeric characters
+    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+
+    if remove_whitespace:
+        s = ''.join(s.split())
+
+    # Encode as dumb ASCII (TODO: this is horrible)
+    return s.encode('ascii', 'replace').replace(b'?', b'')
+
+def check_authors(left, right):
+    """
+    Intended to check GROBID extracted authors (right) against "known good"
+    (but maybe not perfect) Crossref metadata authors ("left").
+    """
+    if not left and not right:
+        return bool(not REQUIRE_AUTHORS)
+    if len(left) != len(right):
+        return False
+    right_all = tokenize(" ".join(right))
+    for i in range(len(left)):
+        l = left[i].lower().replace('jr.', '').split()
+        if not l:
+            return False
+        l = tokenize(l[-1])
+        if len(l) <= 1:
+            # weird author name (single char)
+            return False
+        if l not in right_all:
+            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+            return False
+    return True
+
+def test_check_authors():
+    assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
+    assert not check_authors([], ['one'])
+    assert check_authors(['one'], ['one'])
+    assert check_authors(['one two'], ['One Two'])
+    assert check_authors(['two'], ['One Two'])
+    assert check_authors(['two'], ['two, one'])
+    assert check_authors(['mago'], ['Mr. Magoo'])
+    assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
+    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+
+# Rows are (score, left, right)
+def process_group(rows):
+
+    # first pass reduces size of list and generates a linkage graph
+    filtered = list()
+    for row in rows:
+        score = int(row[0])
+        if score < SCORE_THRESHOLD:
+            continue
+        left = json.loads(row[1])
+        right = json.loads(row[2])
+        # authors must roughly match
+        if not check_authors(left['authors'], right['authors']):
+            continue
+        # years must match (if defined)
+        if left['year'] and right['year'] and left['year'] != right['year']:
+            continue
+        filtered.append((left, right))
+
+    if not filtered:
+        return
+
+    # second pass finds a connected graph and returns that
+    releases = dict()
+    group_ids = set()
+    for row in filtered[1:]:
+        (left, right) = row
+        l_id = left['fatcat_release']
+        r_id = right['fatcat_release']
+        releases[l_id] = left
+        releases[r_id] = right
+        if not group_ids:
+            group_ids.add(l_id)
+            group_ids.add(r_id)
+            continue
+        if l_id in group_ids or r_id in group_ids:
+            group_ids.add(l_id)
+            group_ids.add(r_id)
+            continue
+
+    if not group_ids:
+        return
+
+    print(json.dumps([releases[ident] for ident in group_ids]))
+
+def run():
+
+    last_slug = None
+    lines = []
+
+    # group lines by slug, and process in batches
+    for line in sys.stdin:
+        line = line.strip().split('\t')
+        assert len(line) == 4
+        slug = line[0]
+        if last_slug and slug != last_slug and lines:
+            if len(lines) <= MAX_SLUG_LINES:
+                process_group(lines)
+            lines = []
+        last_slug = slug
+        lines.append(line[1:])
+
+    # catch any remaining
+    if lines:
+        process_group(lines)
+
+if __name__=='__main__':
+    run()
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
new file mode 100755
index 0000000..3654b87
--- /dev/null
+++ b/python/scripts/filter_scored_matches.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""
+Filters an input stream of sorted "matchcrossref" scalding job, and outputs
+"good enough" matches to be inserted to fatcat.
+
+Currently works on DOI numbers. Filters for a high enough string match (doesn't
+re-do title match), and checks author lists. Filters out slugs with too many
+matches, and outputs one-line-per-sha1 (aka, file).
+
+No dependencies (only python3 stdlib)
+"""
+
+import sys
+import json
+
+# out of 1000
+score_threshold = 900
+
+max_slug_lines = 10
+
+require_authors = 1
+
+
+def tokenize(s, remove_whitespace=False):
+
+    s.replace('&apos;', "'")
+    # Remove non-alphanumeric characters
+    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+
+    if remove_whitespace:
+        s = ''.join(s.split())
+
+    # Encode as dumb ASCII (TODO: this is horrible)
+    return s.encode('ascii', 'replace').replace(b'?', b'')
+
+def check_authors(left, right):
+    """
+    Intended to check GROBID extracted authors (right) against "known good"
+    (but maybe not perfect) Crossref metadata authors ("left").
+    """
+    if not left:
+        return False
+    if len(left) > len(right):
+        return False
+    right_all = tokenize(" ".join(right))
+    for i in range(len(left)):
+        l = left[i].lower().replace('jr.', '').split()
+        if not l:
+            return False
+        l = tokenize(l[-1])
+        if len(l) <= 1:
+            # weird author name (single char)
+            return False
+        if l not in right_all:
+            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+            return False
+    return True
+
+def test_check_authors():
+    assert not check_authors([], [])
+    assert not check_authors([], ['one'])
+    assert check_authors(['one'], ['one'])
+    assert check_authors(['one two'], ['One Two'])
+    assert check_authors(['two'], ['One Two'])
+    assert check_authors(['two'], ['two, one'])
+    assert check_authors(['mago'], ['Mr. Magoo'])
+    assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
+    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+
+# Rows are (score, grobid, crossref)
+def process_group(rows):
+    if len(rows) > max_slug_lines:
+        return
+    keepers = dict()
+    for row in rows:
+        score = int(row[0])
+        if score < score_threshold:
+            continue
+        grobid = json.loads(row[1])
+        crossref = json.loads(row[2])
+        if not check_authors(crossref['authors'], grobid['authors']):
+            #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
+            continue
+        else:
+            #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
+            pass
+        sha1 = grobid['sha1']
+        doi = crossref['doi'].lower()
+        l = keepers.get(sha1, list())
+        l.append(doi)
+        keepers[sha1] = l
+    for sha1, doi_list in keepers.items():
+        print("{}\t{}".format(sha1, json.dumps(doi_list)))
+
+def run():
+
+    last_slug = None
+    lines = []
+
+    # group lines by slug, and process in batches
+    for line in sys.stdin:
+        line = line.strip().split('\t')
+        assert len(line) == 4
+        slug = line[0]
+        if last_slug and slug != last_slug and lines:
+            process_group(lines)
+            lines = []
+        last_slug = slug
+        lines.append(line[1:])
+
+    # catch any remaining
+    if lines:
+        process_group(lines)
+
+if __name__=='__main__':
+    run()
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
new file mode 100755
index 0000000..3d2e14c
--- /dev/null
+++ b/python/scripts/import_grobid_metadata.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+import datetime
+
+MAX_ABSTRACT_BYTES=4096
+
+def parse_grobid_json(obj):
+
+    if not obj.get('title'):
+        return None
+
+    extra = dict()
+
+    if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
+        abobj = dict(
+            mimetype="text/plain",
+            language=None,
+            content=obj.get('abstract').strip())
+        abstracts = [abobj]
+    else:
+        abstracts = None
+
+    contribs = []
+    for a in obj.get('authors', []):
+        c = dict(raw_name=a, role="author")
+        contribs.append(c)
+
+    refs = []
+    for raw in obj.get('citations', []):
+        extra = dict()
+        ref = dict()
+        ref['key'] = raw.get('id')
+        if raw.get('title'):
+            ref['title'] = raw['title'].strip()
+        if raw.get('date'):
+            try:
+                year = int(raw['date'].strip()[:4])
+                ref['year'] = year
+            except:
+                pass
+        for key in ('volume', 'url', 'issue', 'publisher'):
+            if raw.get(key):
+                extra[key] = raw[key].strip()
+        if raw.get('authors'):
+            extra['authors'] = [a['name'] for a in raw['authors']]
+        if extra:
+            extra = dict(grobid=extra)
+        else:
+            extra = None
+        ref['extra'] = extra
+        refs.append(ref)
+
+    release_type = "journal-article"
+    release_date = None
+    if obj.get('date'):
+        # TODO: only returns year, ever? how to handle?
+        release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+
+    if obj.get('doi'):
+        extra['doi'] = obj['doi']
+    if obj['journal'].get('name'):
+        extra['container_name'] = obj['journal']['name']
+
+    extra['is_longtail_oa'] = True
+
+    # TODO: ISSN/eISSN handling? or just journal name lookup?
+
+    if extra:
+        extra = dict(grobid=extra)
+    else:
+        extra = None
+
+    return dict(
+        title=obj['title'].strip(),
+        contribs=contribs,
+        publisher=obj['journal'].get('publisher'),
+        volume=obj['journal'].get('volume'),
+        issue=obj['journal'].get('issue'),
+        abstracts=abstracts,
+        release_type=release_type,
+        release_date=release_date,
+        extra=extra)
+
+def run():
+    for line in sys.stdin:
+        obj = json.loads(line)
+        out = parse_grobid_json(obj)
+        if out:
+            print(out)
+
+if __name__=="__main__":
+    run()
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
new file mode 100755
index 0000000..35cee5b
--- /dev/null
+++ b/python/scripts/manifest_converter.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""
+Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of
+"match" JSON objects which can be imported into fatcat with matched_import.py
+
+This was used to convert this manifest:
+
+    https://archive.org/details/ia_papers_manifest_2018-01-25/
+
+to JSON format for fast fatcat importing.
+"""
+
+import sys
+import json
+import sqlite3
+
+# iterate over rows in files metadata...
+# 1. select all identified DOIs
+#   => filter based on count
+# 2. select all file metadata
+# 3. output object
+
+def or_none(s):
+    if s is None:
+        return None
+    elif type(s) == str and ((not s) or s == "\\N" or s == "-"):
+        return None
+    return s
+
+def process_db(db_path):
+
+    db = sqlite3.connect(db_path)
+
+    for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"):
+        sha1 = row[0]
+        dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall()
+        dois = [d[0] for d in dois]
+        if not dois:
+            continue
+        urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall()
+        if not urls:
+            continue
+        cdx = [dict(url=row[0], dt=row[1]) for row in urls]
+        obj = dict(
+            sha1=sha1,
+            mimetype=or_none(row[1]),
+            size=(or_none(row[2]) and int(row[2])),
+            md5=or_none(row[3]),
+            dois=dois,
+            cdx=cdx,
+        )
+        dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
+        print(json.dumps(obj))
+
+if __name__=="__main__":
+    process_db(sys.argv[1])
-- 
cgit v1.2.3