author    Bryan Newbold <bnewbold@robocracy.org>  2021-11-02 18:14:59 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2021-11-02 18:14:59 -0700
commit    31d1a6a713d177990609767d508209ced19ca396 (patch)
tree      a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers
parent    9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/importers')
22 files changed, 2578 insertions, 2115 deletions
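This commit applies the black code formatter to `python/fatcat_tools/`; the diff below is mechanical reformatting only (string-quote normalization, consistent spacing, and wrapping of long call sites into multi-line forms), with no behavior changes. As a minimal sketch of that transformation, the snippet below runs black's Python API on one line taken from the first hunk; the black version and `line_length` value are assumptions, not taken from this commit.

```python
# Minimal sketch of the rewrite black performs throughout this diff:
# single quotes become double quotes and assignments get normalized spacing.
# The line_length value is an assumption; the repo's actual setting may differ.
import black

before = "ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL'\n"

after = black.format_str(before, mode=black.FileMode(line_length=96))
print(after)
# ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
```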
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index 2b0ff7ec..ae4f9049 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,9 +1,9 @@ - import fatcat_openapi_client from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url -ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' +ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL" + class ArabesqueMatchImporter(EntityImporter): """ @@ -38,17 +38,17 @@ class ArabesqueMatchImporter(EntityImporter): def __init__(self, api, extid_type, require_grobid=True, **kwargs): - eg_desc = kwargs.get('editgroup_description', None) or "Match web crawl files to releases based on identifier/URL seedlist" - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter') - if kwargs.get('crawl_id'): - eg_extra['crawl_id'] = kwargs.get('crawl_id') - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) - assert extid_type in ('doi', 'pmcid', 'pmid') + eg_desc = ( + kwargs.get("editgroup_description", None) + or "Match web crawl files to releases based on identifier/URL seedlist" + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArabesqueMatchImporter") + if kwargs.get("crawl_id"): + eg_extra["crawl_id"] = kwargs.get("crawl_id") + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) + assert extid_type in ("doi", "pmcid", "pmid") self.extid_type = extid_type self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel @@ -60,33 +60,35 @@ class ArabesqueMatchImporter(EntityImporter): print("NOT checking GROBID status column") def want(self, row): - if self.require_grobid and not row['postproc_status'] == "200": + if self.require_grobid and not row["postproc_status"] == "200": return False - if (bool(row['hit']) is True - and row['final_sha1'] - and row['final_timestamp'] - and row['final_timestamp'] != "-" - and len(row['final_timestamp']) == 14 - and row['final_mimetype'] - and bool(row['hit']) is True - and row['identifier']): + if ( + bool(row["hit"]) is True + and row["final_sha1"] + and row["final_timestamp"] + and row["final_timestamp"] != "-" + and len(row["final_timestamp"]) == 14 + and row["final_mimetype"] + and bool(row["hit"]) is True + and row["identifier"] + ): return True else: return False def parse_record(self, row): - extid = row['identifier'].strip() + extid = row["identifier"].strip() # check/cleanup DOI - if self.extid_type == 'doi': + if self.extid_type == "doi": extid = extid.lower() - extid.replace('http://doi.org/', '') - extid.replace('https://doi.org/', '') - if extid.startswith('doi:'): + extid.replace("http://doi.org/", "") + extid.replace("https://doi.org/", "") + if extid.startswith("doi:"): extid = extid[4:] - if not extid.startswith('10.'): - self.counts['skip-extid-invalid'] + if not extid.startswith("10."): + self.counts["skip-extid-invalid"] return None # lookup extid @@ -95,35 +97,35 @@ class ArabesqueMatchImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status == 404: # bail on 404 (release not in DB) - self.counts['skip-extid-not-found'] += 1 + 
self.counts["skip-extid-not-found"] += 1 return None elif err.status == 400: - self.counts['skip-extid-invalid'] += 1 + self.counts["skip-extid-invalid"] += 1 return None else: raise err - url = make_rel_url(row['final_url'], self.default_link_rel) + url = make_rel_url(row["final_url"], self.default_link_rel) if not url: - self.counts['skip-url'] += 1 + self.counts["skip-url"] += 1 return None - if not row['final_timestamp']: - self.counts['skip-missing-timestamp'] += 1 + if not row["final_timestamp"]: + self.counts["skip-missing-timestamp"] += 1 return None wayback = "https://web.archive.org/web/{}/{}".format( - row['final_timestamp'], - row['final_url']) + row["final_timestamp"], row["final_url"] + ) urls = [url, ("webarchive", wayback)] urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] if len(urls) > SANE_MAX_URLS: - self.counts['skip-too-many-url'] += 1 + self.counts["skip-too-many-url"] += 1 return None fe = fatcat_openapi_client.FileEntity( - sha1=b32_hex(row['final_sha1']), - mimetype=row['final_mimetype'] or self.default_mimetype, + sha1=b32_hex(row["final_sha1"]), + mimetype=row["final_mimetype"] or self.default_mimetype, release_ids=[re.ident], urls=urls, ) @@ -143,15 +145,15 @@ class ArabesqueMatchImporter(EntityImporter): if (fe.release_ids[0] in existing.release_ids) and existing.urls: # TODO: could still, in theory update with the new URL? - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not self.do_updates: - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False if existing.ident in [e.ident for e in self._edits_inflight]: - self.counts['skip-update-inflight'] += 1 + self.counts["skip-update-inflight"] += 1 return False # TODO: this code path never gets hit because of the check above @@ -159,28 +161,33 @@ class ArabesqueMatchImporter(EntityImporter): existing_urls = set([u.url for u in existing.urls]) new_urls = set([u.url for u in fe.urls]) if existing_urls.issuperset(new_urls): - self.counts['skip-update-nothing-new'] += 1 + self.counts["skip-update-nothing-new"] += 1 return False # merge the existing into this one and update existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) - existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + existing.urls = [ + fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls + ] if len(existing.urls) > SANE_MAX_URLS: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) if len(existing.release_ids) > SANE_MAX_RELEASES: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.mimetype = existing.mimetype or fe.mimetype edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) self._edits_inflight.append(edit) - self.counts['update'] += 1 + self.counts["update"] += 1 return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git 
a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index fc429fb0..7a689ed2 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -1,4 +1,3 @@ - import datetime import json import re @@ -13,6 +12,7 @@ from .crossref import lookup_license_slug latex2text = LatexNodes2Text() + def latex_to_text(raw): try: return latex2text.latex_to_text(raw).strip() @@ -21,13 +21,14 @@ def latex_to_text(raw): except IndexError: return raw.strip() + def parse_arxiv_authors(raw): if not raw: return [] - raw = raw.replace('*', '') - if '(' in raw: - raw = re.sub(r'\(.*\)', '', raw) - authors = raw.split(', ') + raw = raw.replace("*", "") + if "(" in raw: + raw = re.sub(r"\(.*\)", "", raw) + authors = raw.split(", ") if authors: last = authors[-1].split(" and ") if len(last) == 2: @@ -39,9 +40,12 @@ def parse_arxiv_authors(raw): authors = [a for a in authors if a] return authors + def test_parse_arxiv_authors(): - assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [ + assert parse_arxiv_authors( + "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an" + ) == [ "Raphael Chetrite", "Shamik Gupta", "Izaak Neri", @@ -63,7 +67,9 @@ def test_parse_arxiv_authors(): "Raphael Chetrite Shamik Gupta", ] - assert parse_arxiv_authors("B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)") == [ + assert parse_arxiv_authors( + "B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)" + ) == [ "B. P. Lanyon", "T. J. Weinhold", "N. K. Langford", @@ -84,17 +90,21 @@ class ArxivRawImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of arxiv metadata via arXivRaw OAI-PMH feed") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter') + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of arxiv metadata via arXivRaw OAI-PMH feed", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArxivRawImporter") # lower batch size, because multiple versions per entry (guessing 2-3 on average?) 
- batch_size = kwargs.get('edit_batch_size', 50) - super().__init__(api, + batch_size = kwargs.get("edit_batch_size", 50) + super().__init__( + api, editgroup_description=eg_desc, editgroup_extra=eg_extra, batch_size=batch_size, - **kwargs) + **kwargs + ) self._test_override = False def parse_record(self, record): @@ -114,53 +124,56 @@ class ArxivRawImporter(EntityImporter): doi = None if metadata.doi and metadata.doi.string: doi = metadata.doi.string.lower().split()[0].strip() - if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]): + if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): sys.stderr.write("BOGUS DOI: {}\n".format(doi)) doi = None - title = latex_to_text(metadata.title.get_text().replace('\n', ' ')) - authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' ')) - contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)] - - lang = "en" # the vast majority in english + title = latex_to_text(metadata.title.get_text().replace("\n", " ")) + authors = parse_arxiv_authors(metadata.authors.get_text().replace("\n", " ")) + contribs = [ + fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role="author") + for i, a in enumerate(authors) + ] + + lang = "en" # the vast majority in english if metadata.comments and metadata.comments.get_text(): - comments = metadata.comments.get_text().replace('\n', ' ').strip() - extra_arxiv['comments'] = comments - if 'in french' in comments.lower(): - lang = 'fr' - elif 'in spanish' in comments.lower(): - lang = 'es' - elif 'in portuguese' in comments.lower(): - lang = 'pt' - elif 'in hindi' in comments.lower(): - lang = 'hi' - elif 'in japanese' in comments.lower(): - lang = 'ja' - elif 'in german' in comments.lower(): - lang = 'de' - elif 'simplified chinese' in comments.lower(): - lang = 'zh' - elif 'in russian' in comments.lower(): - lang = 'ru' + comments = metadata.comments.get_text().replace("\n", " ").strip() + extra_arxiv["comments"] = comments + if "in french" in comments.lower(): + lang = "fr" + elif "in spanish" in comments.lower(): + lang = "es" + elif "in portuguese" in comments.lower(): + lang = "pt" + elif "in hindi" in comments.lower(): + lang = "hi" + elif "in japanese" in comments.lower(): + lang = "ja" + elif "in german" in comments.lower(): + lang = "de" + elif "simplified chinese" in comments.lower(): + lang = "zh" + elif "in russian" in comments.lower(): + lang = "ru" # more languages? number = None - if metadata.find('journal-ref') and metadata.find('journal-ref').get_text(): - journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip() - extra_arxiv['journal_ref'] = journal_ref + if metadata.find("journal-ref") and metadata.find("journal-ref").get_text(): + journal_ref = metadata.find("journal-ref").get_text().replace("\n", " ").strip() + extra_arxiv["journal_ref"] = journal_ref if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(): release_type = "paper-conference" - if metadata.find('report-no') and metadata.find('report-no').string: - number = metadata.find('report-no').string.strip() + if metadata.find("report-no") and metadata.find("report-no").string: + number = metadata.find("report-no").string.strip() # at least some people plop extra metadata in here. hrmf! 
- if 'ISSN ' in number or 'ISBN ' in number or len(number.split()) > 2: - extra_arxiv['report-no'] = number + if "ISSN " in number or "ISBN " in number or len(number.split()) > 2: + extra_arxiv["report-no"] = number number = None else: release_type = "report" - if metadata.find('acm-class') and metadata.find('acm-class').string: - extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip() + if metadata.find("acm-class") and metadata.find("acm-class").string: + extra_arxiv["acm_class"] = metadata.find("acm-class").string.strip() if metadata.categories and metadata.categories.get_text(): - extra_arxiv['categories'] = metadata.categories.get_text().split() + extra_arxiv["categories"] = metadata.categories.get_text().split() license_slug = None if metadata.license and metadata.license.get_text(): license_slug = lookup_license_slug(metadata.license.get_text()) @@ -170,21 +183,29 @@ class ArxivRawImporter(EntityImporter): abstracts = [] abst = metadata.abstract.get_text().strip() orig = None - if '-----' in abst: - both = abst.split('-----') + if "-----" in abst: + both = abst.split("-----") abst = both[0].strip() orig = both[1].strip() - if '$' in abst or '{' in abst: + if "$" in abst or "{" in abst: mime = "application/x-latex" abst_plain = latex_to_text(abst) - abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en")) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + content=abst_plain, mimetype="text/plain", lang="en" + ) + ) else: mime = "text/plain" - abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en")) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en") + ) if orig: - abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime)) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime) + ) # indicates that fulltext probably isn't english either - if lang == 'en': + if lang == "en": lang = None # extra: @@ -195,39 +216,43 @@ class ArxivRawImporter(EntityImporter): # container_name # group-title # arxiv: comments, categories, etc - extra_arxiv['base_id'] = base_id - extra['superceded'] = True - extra['arxiv'] = extra_arxiv + extra_arxiv["base_id"] = base_id + extra["superceded"] = True + extra["arxiv"] = extra_arxiv versions = [] - for version in metadata.find_all('version'): - arxiv_id = base_id + version['version'] + for version in metadata.find_all("version"): + arxiv_id = base_id + version["version"] release_date = version.date.string.strip() - release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date() + release_date = datetime.datetime.strptime( + release_date, "%a, %d %b %Y %H:%M:%S %Z" + ).date() # TODO: source_type? 
- versions.append(fatcat_openapi_client.ReleaseEntity( - work_id=None, - title=title, - #original_title - version=version['version'], - release_type=release_type, - release_stage='submitted', - release_date=release_date.isoformat(), - release_year=release_date.year, - ext_ids=fatcat_openapi_client.ReleaseExtIds( - arxiv=arxiv_id, - ), - number=number, - language=lang, - license_slug=license_slug, - abstracts=abstracts, - contribs=contribs, - extra=extra.copy(), - )) + versions.append( + fatcat_openapi_client.ReleaseEntity( + work_id=None, + title=title, + # original_title + version=version["version"], + release_type=release_type, + release_stage="submitted", + release_date=release_date.isoformat(), + release_year=release_date.year, + ext_ids=fatcat_openapi_client.ReleaseExtIds( + arxiv=arxiv_id, + ), + number=number, + language=lang, + license_slug=license_slug, + abstracts=abstracts, + contribs=contribs, + extra=extra.copy(), + ) + ) # TODO: assert that versions are actually in order? assert versions - versions[-1].extra.pop('superceded') + versions[-1].extra.pop("superceded") # only apply DOI to most recent version (HACK) if doi: @@ -306,7 +331,7 @@ class ArxivRawImporter(EntityImporter): for v in versions: if v._existing_work_id: if not v._updated: - self.counts['exists'] += 1 + self.counts["exists"] += 1 continue if not any_work_id and last_edit: # fetch the last inserted release from this group @@ -315,7 +340,7 @@ class ArxivRawImporter(EntityImporter): any_work_id = r.work_id v.work_id = any_work_id last_edit = self.api.create_release(self.get_editgroup_id(), v) - self.counts['insert'] += 1 + self.counts["insert"] += 1 return False @@ -323,12 +348,15 @@ class ArxivRawImporter(EntityImporter): # there is no batch/bezerk mode for arxiv importer, except for testing if self._test_override: for batch in batch_batch: - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) - self.counts['insert'] += len(batch) - 1 + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) + self.counts["insert"] += len(batch) - 1 else: raise NotImplementedError() @@ -341,9 +369,9 @@ class ArxivRawImporter(EntityImporter): for article in soup.find_all("record"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) -if __name__ == '__main__': +if __name__ == "__main__": parser = ArxivRawImporter(None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index 0340f6a3..e9de42fc 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -34,15 +34,15 @@ def single_file(prefix, path): hashlib.sha1(), hashlib.sha256(), ] - with open(full, 'rb') as fp: + with open(full, "rb") as fp: while True: - data = fp.read(2**20) + data = fp.read(2 ** 20) if not data: break for h in hashes: h.update(data) mime = magic.Magic(mime=True).from_file(full) - if mime == 'application/octet-stream': + if mime == "application/octet-stream": # magic apparently isn't that great; try using filename as well guess = mimetypes.guess_type(full)[0] if guess: @@ -54,9 +54,11 @@ def single_file(prefix, path): md5=hashes[0].hexdigest(), 
sha1=hashes[1].hexdigest(), sha256=hashes[2].hexdigest(), - extra=dict(mimetype=mime)) + extra=dict(mimetype=mime), + ) return fsf + def make_manifest(base_dir): manifest = [] for root, dirs, files in os.walk(base_dir): @@ -70,47 +72,49 @@ def cdl_dash_release(meta, extra=None): if not extra: extra = dict() - assert meta['identifier']['type'] == 'DOI' - doi = meta['identifier']['value'].lower() - assert doi.startswith('10.') + assert meta["identifier"]["type"] == "DOI" + doi = meta["identifier"]["value"].lower() + assert doi.startswith("10.") ark_id = None - for extid in meta.get('alternativeIdentifiers', []): - if extid['value'].startswith('ark:'): - ark_id = extid['value'] + for extid in meta.get("alternativeIdentifiers", []): + if extid["value"].startswith("ark:"): + ark_id = extid["value"] assert ark_id - license_slug = lookup_license_slug(meta['rights']['uri']) + license_slug = lookup_license_slug(meta["rights"]["uri"]) abstracts = [] - for desc in meta['descriptions']: - if desc['type'] == "abstract": - abstracts.append(ReleaseAbstract( - mimetype="text/html", - content=clean(desc['value']))) - #print(abstracts) + for desc in meta["descriptions"]: + if desc["type"] == "abstract": + abstracts.append( + ReleaseAbstract(mimetype="text/html", content=clean(desc["value"])) + ) + # print(abstracts) if not abstracts: abstracts = None contribs = [] - for creator in meta['creator']: - contribs.append(ReleaseContrib( - given_name=creator['given'], - surname=creator['family'], - # sorry everybody - raw_name="{} {}".format(creator['given'], creator['family']), - raw_affiliation=creator.get('affiliation'), - role="author", # presumably, for these datasets? - )) + for creator in meta["creator"]: + contribs.append( + ReleaseContrib( + given_name=creator["given"], + surname=creator["family"], + # sorry everybody + raw_name="{} {}".format(creator["given"], creator["family"]), + raw_affiliation=creator.get("affiliation"), + role="author", # presumably, for these datasets? 
+ ) + ) r = ReleaseEntity( ext_ids=ReleaseExtIds( doi=doi, ark=ark_id, ), - title=clean(meta['title'], force_xml=True), - publisher=clean(meta['publisher']), - release_year=int(meta['publicationYear']), + title=clean(meta["title"], force_xml=True), + publisher=clean(meta["publisher"]), + release_year=int(meta["publicationYear"]), release_type="dataset", license_slug=license_slug, contribs=contribs, @@ -119,66 +123,66 @@ def cdl_dash_release(meta, extra=None): ) return r + def make_release_fileset(dat_path): - if dat_path.endswith('/'): + if dat_path.endswith("/"): dat_path = dat_path[:-1] dat_discovery = dat_path extra = dict() assert len(dat_discovery) == 64 - with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp: + with open(dat_path + "/cdl_dash_metadata.json", "r") as fp: meta_dict = json.loads(fp.read()) release = cdl_dash_release(meta_dict) - ark_id = release.extra['ark_id'] + ark_id = release.extra["ark_id"] dash_version = None # really crude XML parse-out - with open(dat_path + "/stash-wrapper.xml", 'r') as fp: + with open(dat_path + "/stash-wrapper.xml", "r") as fp: for line in fp: line = line.strip() if line.startswith("<st:version_number>"): - dash_version = int(line[19:].split('<')[0]) + dash_version = int(line[19:].split("<")[0]) assert dash_version is not None - extra['cdl_dash'] = dict(version=dash_version) - release.extra['cdl_dash'] = dict(version=dash_version) + extra["cdl_dash"] = dict(version=dash_version) + release.extra["cdl_dash"] = dict(version=dash_version) manifest = make_manifest(dat_path + "/files/") bundle_url = dict( url="https://merritt.cdlib.org/u/{}/{}".format( - urllib.parse.quote(ark_id, safe=''), - dash_version), - rel="repo-bundle") + urllib.parse.quote(ark_id, safe=""), dash_version + ), + rel="repo-bundle", + ) repo_url = dict( url="https://merritt.cdlib.org/d/{}/{}/".format( - urllib.parse.quote(ark_id, safe=''), - dash_version), - rel="repo") - dat_url = dict( - url="dat://{}/files/".format(dat_discovery), - rel="dweb") + urllib.parse.quote(ark_id, safe=""), dash_version + ), + rel="repo", + ) + dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb") fs = FilesetEntity( - urls=[bundle_url, repo_url, dat_url], - release_ids=None, - manifest=manifest, - extra=extra) + urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra + ) return (release, fs) + def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None): - git_rev = subprocess.check_output( - ["git", "describe", "--always"]).strip().decode('utf-8') + git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") (release, fileset) = make_release_fileset(dat_path) if not editgroup_id: - eg = api.create_editgroup(Editgroup( - description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", - extra=dict( - git_rev=git_rev, - agent="fatcat_tools.auto_cdl_dash_dat"))) + eg = api.create_editgroup( + Editgroup( + description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", + extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"), + ) + ) editgroup_id = eg.editgroup_id if not release_id and release.ext_ids.doi: @@ -201,6 +205,7 @@ def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None): fileset = api.get_fileset(edit.ident) return (editgroup_id, release, fileset) -if __name__=='__main__': + +if __name__ == "__main__": # pass this a discovery key that has been cloned to the local directory 
print(make_release_fileset(sys.argv[1])) diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 0b634e73..8d2a89b6 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from .common import EntityImporter, clean @@ -15,20 +14,19 @@ class ChoculaImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata from Chocula tool.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ChoculaImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of container-level metadata from Chocula tool.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): - if not raw_record.get('ident') and not raw_record.get('_known_issnl'): - self.counts['skip-unknown-new-issnl'] += 1 + if not raw_record.get("ident") and not raw_record.get("_known_issnl"): + self.counts["skip-unknown-new-issnl"] += 1 return False - if raw_record.get('issnl') and raw_record.get('name'): + if raw_record.get("issnl") and raw_record.get("name"): return True return False @@ -39,42 +37,55 @@ class ChoculaImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - name = clean(row.get('name')) + name = clean(row.get("name")) if not name: # Name is required (by schema) return None name = name.strip() - if name.endswith(', Proceedings of the'): - name = "Proceedings of the " + name.split(',')[0] + if name.endswith(", Proceedings of the"): + name = "Proceedings of the " + name.split(",")[0] - if name.endswith('.'): + if name.endswith("."): name = name[:-1] extra = dict() - for k in ('urls', 'webarchive_urls', 'country', - 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages', - 'ia', 'scielo', 'kbart', 'publisher_type', 'platform'): - if row['extra'].get(k): - extra[k] = row['extra'][k] + for k in ( + "urls", + "webarchive_urls", + "country", + "sherpa_romeo", + "ezb", + "szczepanski", + "doaj", + "languages", + "ia", + "scielo", + "kbart", + "publisher_type", + "platform", + ): + if row["extra"].get(k): + extra[k] = row["extra"][k] container_type = None - if 'proceedings' in name.lower(): - container_type = 'proceedings' - elif 'journal ' in name.lower(): - container_type = 'journal' + if "proceedings" in name.lower(): + container_type = "proceedings" + elif "journal " in name.lower(): + container_type = "journal" ce = fatcat_openapi_client.ContainerEntity( - issnl=row['issnl'], - issnp=row['extra'].get('issnp'), - issne=row['extra'].get('issne'), - ident=row['ident'], + issnl=row["issnl"], + issnp=row["extra"].get("issnp"), + issne=row["extra"].get("issne"), + ident=row["ident"], name=name, container_type=container_type, - publisher=clean(row.get('publisher')), - wikidata_qid=row.get('wikidata_qid'), - extra=extra) + publisher=clean(row.get("publisher")), + wikidata_qid=row.get("wikidata_qid"), + extra=extra, + ) return ce def try_update(self, ce): @@ -86,12 +97,12 @@ class ChoculaImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err - 
self.counts['exists'] += 1 - self.counts['exists-not-found'] += 1 + self.counts["exists"] += 1 + self.counts["exists-not-found"] += 1 return False - if existing.state != 'active': - self.counts['exists'] += 1 - self.counts['exists-inactive'] += 1 + if existing.state != "active": + self.counts["exists"] += 1 + self.counts["exists-inactive"] += 1 return False if not existing: @@ -102,8 +113,8 @@ class ChoculaImporter(EntityImporter): if err.status != 404: raise err if existing: - self.counts['exists'] += 1 - self.counts['exists-by-issnl'] += 1 + self.counts["exists"] += 1 + self.counts["exists-by-issnl"] += 1 return False # doesn't exist, always create return True @@ -111,18 +122,22 @@ class ChoculaImporter(EntityImporter): # decide whether to update do_update = False if not self.do_updates: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not existing.extra: existing.extra = dict() - if ce.extra.get('urls') and set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): + if ce.extra.get("urls") and set(ce.extra.get("urls", [])) != set( + existing.extra.get("urls", []) + ): do_update = True - if ce.extra.get('webarchive_urls') and set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])): + if ce.extra.get("webarchive_urls") and set(ce.extra.get("webarchive_urls", [])) != set( + existing.extra.get("webarchive_urls", []) + ): do_update = True - for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'): + for k in ("ezb", "szczepanski", "publisher_type", "platform"): if ce.extra.get(k) and not existing.extra.get(k): do_update = True - for k in ('kbart', 'ia', 'doaj'): + for k in ("kbart", "ia", "doaj"): # always update these fields if not equal (chocula override) if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k): do_update = True @@ -137,41 +152,53 @@ class ChoculaImporter(EntityImporter): existing.container_type = existing.container_type or ce.container_type existing.issne = existing.issne or ce.issne existing.issnp = existing.issnp or ce.issnp - for k in ('urls', 'webarchive_urls'): + for k in ("urls", "webarchive_urls"): # be conservative about URL updates; don't clobber existing URL lists # may want to make this behavior more sophisticated in the # future, or at least a config flag if ce.extra.get(k) and not existing.extra.get(k): existing.extra[k] = ce.extra.get(k, []) - for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia', - 'scielo', 'kbart', 'publisher_type', 'platform'): + for k in ( + "sherpa_romeo", + "ezb", + "szczepanski", + "doaj", + "ia", + "scielo", + "kbart", + "publisher_type", + "platform", + ): # always update (chocula over-rides) if ce.extra.get(k): existing.extra[k] = ce.extra[k] - for k in ('country',): + for k in ("country",): # only include if not set (don't clobber human edits) if ce.extra.get(k) and not existing.extra.get(k): existing.extra[k] = ce.extra[k] - if ce.extra.get('languages'): - if not existing.extra.get('languages'): - existing.extra['languages'] = ce.extra['languages'] - elif not ce.extra['languages'][0] in existing.extra['languages']: - existing.extra['languages'].append(ce.extra['languages'][0]) + if ce.extra.get("languages"): + if not existing.extra.get("languages"): + existing.extra["languages"] = ce.extra["languages"] + elif not ce.extra["languages"][0] in existing.extra["languages"]: + existing.extra["languages"].append(ce.extra["languages"][0]) self.api.update_container(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + 
self.counts["update"] += 1 return False else: - self.counts['exists'] += 1 - self.counts['exists-skip-update'] += 1 + self.counts["exists"] += 1 + self.counts["exists-skip-update"] += 1 return False # if we got this far, it's a bug raise NotImplementedError def insert_batch(self, batch): - self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_container_auto_batch( + fatcat_openapi_client.ContainerAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index e33a2012..2639c85a 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -1,4 +1,3 @@ - import csv import datetime import json @@ -34,7 +33,6 @@ SANE_MAX_URLS: int = 100 DOMAIN_REL_MAP: Dict[str, str] = { "archive.org": "archive", # LOCKSS, Portico, DuraSpace, etc would also be "archive" - "arxiv.org": "repository", "babel.hathitrust.org": "repository", "cds.cern.ch": "repository", @@ -53,7 +51,6 @@ DOMAIN_REL_MAP: Dict[str, str] = { "zenodo.org": "repository", "www.biorxiv.org": "repository", "www.medrxiv.org": "repository", - "citeseerx.ist.psu.edu": "aggregator", "publisher-connector.core.ac.uk": "aggregator", "core.ac.uk": "aggregator", @@ -62,7 +59,6 @@ DOMAIN_REL_MAP: Dict[str, str] = { "pdfs.semanticscholar.org": "aggregator", "semanticscholar.org": "aggregator", "www.semanticscholar.org": "aggregator", - "academic.oup.com": "publisher", "cdn.elifesciences.org": "publisher", "cell.com": "publisher", @@ -86,15 +82,14 @@ DOMAIN_REL_MAP: Dict[str, str] = { "ehp.niehs.nih.gov": "publisher", "journals.tsu.ru": "publisher", "www.cogentoa.com": "publisher", - "www.researchgate.net": "academicsocial", "academia.edu": "academicsocial", - "wayback.archive-it.org": "webarchive", "web.archive.org": "webarchive", "archive.is": "webarchive", } + def make_rel_url(raw_url: str, default_link_rel: str = "web"): # this is where we map specific domains to rel types, and also filter out # bad domains, invalid URLs, etc @@ -105,12 +100,17 @@ def make_rel_url(raw_url: str, default_link_rel: str = "web"): break return (rel, raw_url) + def test_make_rel_url(): assert make_rel_url("http://example.com/thing.pdf")[0] == "web" assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans" - assert make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] == "webarchive" + assert ( + make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] + == "webarchive" + ) assert make_rel_url("http://cell.com/thing.pdf")[0] == "publisher" + class EntityImporter: """ Base class for fatcat entity importers. 
@@ -147,23 +147,26 @@ class EntityImporter: def __init__(self, api, **kwargs): - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['git_rev'] = eg_extra.get('git_rev', - subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["git_rev"] = eg_extra.get( + "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip() + ).decode("utf-8") + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityImporter") self.api = api - self.do_updates = bool(kwargs.get('do_updates', True)) - self.do_fuzzy_match: bool = kwargs.get('do_fuzzy_match', True) - self.bezerk_mode: bool = kwargs.get('bezerk_mode', False) - self.submit_mode: bool = kwargs.get('submit_mode', False) - self.edit_batch_size: int = kwargs.get('edit_batch_size', 100) - self.editgroup_description: Optional[str] = kwargs.get('editgroup_description') + self.do_updates = bool(kwargs.get("do_updates", True)) + self.do_fuzzy_match: bool = kwargs.get("do_fuzzy_match", True) + self.bezerk_mode: bool = kwargs.get("bezerk_mode", False) + self.submit_mode: bool = kwargs.get("submit_mode", False) + self.edit_batch_size: int = kwargs.get("edit_batch_size", 100) + self.editgroup_description: Optional[str] = kwargs.get("editgroup_description") self.editgroup_extra: Optional[Any] = eg_extra - self.es_client = kwargs.get('es_client') + self.es_client = kwargs.get("es_client") if not self.es_client: - self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120) + self.es_client = elasticsearch.Elasticsearch( + "https://search.fatcat.wiki", timeout=120 + ) self._issnl_id_map: Dict[str, Any] = dict() self._orcid_id_map: Dict[str, Any] = dict() @@ -174,7 +177,7 @@ class EntityImporter: self.reset() def reset(self) -> None: - self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + self.counts = Counter({"total": 0, "skip": 0, "insert": 0, "update": 0, "exists": 0}) self._edit_count: int = 0 self._editgroup_id: Optional[str] = None self._entity_queue: List[Any] = [] @@ -184,13 +187,13 @@ class EntityImporter: """ Returns nothing. 
""" - self.counts['total'] += 1 + self.counts["total"] += 1 if (not raw_record) or (not self.want(raw_record)): - self.counts['skip'] += 1 + self.counts["skip"] += 1 return entity = self.parse_record(raw_record) if not entity: - self.counts['skip'] += 1 + self.counts["skip"] += 1 return if self.bezerk_mode: self.push_entity(entity) @@ -230,7 +233,7 @@ class EntityImporter: if self._entity_queue: self.insert_batch(self._entity_queue) - self.counts['insert'] += len(self._entity_queue) + self.counts["insert"] += len(self._entity_queue) self._entity_queue = [] return self.counts @@ -248,8 +251,9 @@ class EntityImporter: if not self._editgroup_id: eg = self.api.create_editgroup( fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) self._editgroup_id = eg.editgroup_id self._edit_count += edits @@ -257,30 +261,30 @@ class EntityImporter: def create_container(self, entity): eg_id = self.get_editgroup_id() - self.counts['inserted.container'] += 1 + self.counts["inserted.container"] += 1 return self.api.create_container(eg_id, entity) def create_release(self, entity): eg_id = self.get_editgroup_id() - self.counts['inserted.release'] += 1 + self.counts["inserted.release"] += 1 return self.api.create_release(eg_id, entity) def create_file(self, entity): eg_id = self.get_editgroup_id() - self.counts['inserted.file'] += 1 + self.counts["inserted.file"] += 1 return self.api.create_file(eg_id, entity) def updated(self): """ Implementations should call this from try_update() if the update was successful """ - self.counts['update'] += 1 + self.counts["update"] += 1 def push_entity(self, entity): self._entity_queue.append(entity) if len(self._entity_queue) >= self.edit_batch_size: self.insert_batch(self._entity_queue) - self.counts['insert'] += len(self._entity_queue) + self.counts["insert"] += len(self._entity_queue) self._entity_queue = [] def want(self, raw_record: Any) -> bool: @@ -324,7 +328,7 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._orcid_id_map[orcid] = creator_id # might be None + self._orcid_id_map[orcid] = creator_id # might be None return creator_id def is_doi(self, doi: str) -> bool: @@ -347,7 +351,7 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._doi_id_map[doi] = release_id # might be None + self._doi_id_map[doi] = release_id # might be None return release_id def lookup_pmid(self, pmid: str): @@ -364,11 +368,11 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._pmid_id_map[pmid] = release_id # might be None + self._pmid_id_map[pmid] = release_id # might be None return release_id def is_issnl(self, issnl: str) -> bool: - return len(issnl) == 9 and issnl[4] == '-' + return len(issnl) == 9 and issnl[4] == "-" def lookup_issnl(self, issnl: str): """Caches calls to the ISSN-L lookup API endpoint in a local dict""" @@ -382,7 +386,7 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._issnl_id_map[issnl] = container_id # might be None + self._issnl_id_map[issnl] = container_id # might be None return container_id def read_issn_map_file(self, issn_map_file): @@ -417,26 +421,26 @@ class EntityImporter: # update old/deprecated 'rel' on URLs for i in 
range(len(existing.urls)): u = existing.urls[i] - if u.rel == 'repository' and '://archive.org/download/' in u.url: - existing.urls[i].rel = 'archive' - if u.rel == 'social': - u.rel = 'academicsocial' + if u.rel == "repository" and "://archive.org/download/" in u.url: + existing.urls[i].rel = "archive" + if u.rel == "social": + u.rel = "academicsocial" # remove URLs which are near-duplicates redundant_urls = [] all_urls = [u.url for u in existing.urls] - all_wayback_urls = [u.url for u in existing.urls if '://web.archive.org/web/' in u.url] + all_wayback_urls = [u.url for u in existing.urls if "://web.archive.org/web/" in u.url] for url in all_urls: # https/http redundancy - if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls: + if url.startswith("http://") and url.replace("http://", "https://", 1) in all_urls: redundant_urls.append(url) continue # default HTTP port included and not included - if ':80/' in url and url.replace(':80', '', 1) in all_urls: + if ":80/" in url and url.replace(":80", "", 1) in all_urls: redundant_urls.append(url) continue # partial and complete wayback timestamps - if '://web.archive.org/web/2017/' in url: + if "://web.archive.org/web/2017/" in url: original_url = "/".join(url.split("/")[5:]) assert len(original_url) > 5 for wb_url in all_wayback_urls: @@ -452,7 +456,9 @@ class EntityImporter: def generic_fileset_cleanups(existing): return existing - def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]: + def match_existing_release_fuzzy( + self, release: ReleaseEntity + ) -> Optional[Tuple[str, str, ReleaseEntity]]: """ This helper function uses fuzzycat (and elasticsearch) to look for existing release entities with similar metadata. @@ -488,7 +494,15 @@ class EntityImporter: return None release_dict = entity_to_dict(release, api_client=self.api.api_client) - verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates] + verified = [ + ( + fuzzycat.verify.verify( + release_dict, entity_to_dict(c, api_client=self.api.api_client) + ), + c, + ) + for c in candidates + ] # chose the "closest" match closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0] @@ -522,7 +536,6 @@ class RecordPusher: class JsonLinePusher(RecordPusher): - def __init__(self, importer, json_file, **kwargs): self.importer = importer self.json_file = json_file @@ -539,10 +552,9 @@ class JsonLinePusher(RecordPusher): class CsvPusher(RecordPusher): - def __init__(self, importer, csv_file, **kwargs): self.importer = importer - self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) + self.reader = csv.DictReader(csv_file, delimiter=kwargs.get("delimiter", ",")) def run(self): for line in self.reader: @@ -555,7 +567,6 @@ class CsvPusher(RecordPusher): class LinePusher(RecordPusher): - def __init__(self, importer, text_file, **kwargs): self.importer = importer self.text_file = text_file @@ -571,17 +582,15 @@ class LinePusher(RecordPusher): class SqlitePusher(RecordPusher): - def __init__(self, importer, db_file, table_name, where_clause="", **kwargs): self.importer = importer - self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE') + self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE") self.db.row_factory = sqlite3.Row self.table_name = table_name self.where_clause = where_clause def run(self): - cur = self.db.execute("SELECT * FROM {} {};".format( - self.table_name, self.where_clause)) + cur = 
self.db.execute("SELECT * FROM {} {};".format(self.table_name, self.where_clause)) for row in cur: self.importer.push_record(row) counts = self.importer.finish() @@ -590,7 +599,6 @@ class SqlitePusher(RecordPusher): class Bs4XmlLinesPusher(RecordPusher): - def __init__(self, importer, xml_file, prefix_filter=None, **kwargs): self.importer = importer self.xml_file = xml_file @@ -611,7 +619,6 @@ class Bs4XmlLinesPusher(RecordPusher): class Bs4XmlFilePusher(RecordPusher): - def __init__(self, importer, xml_file, record_tag, **kwargs): self.importer = importer self.xml_file = xml_file @@ -684,7 +691,6 @@ class Bs4XmlLargeFilePusher(RecordPusher): class Bs4XmlFileListPusher(RecordPusher): - def __init__(self, importer, list_file, record_tag, **kwargs): self.importer = importer self.list_file = list_file @@ -695,7 +701,7 @@ class Bs4XmlFileListPusher(RecordPusher): xml_path = xml_path.strip() if not xml_path or xml_path.startswith("#"): continue - with open(xml_path, 'r') as xml_file: + with open(xml_path, "r") as xml_file: soup = BeautifulSoup(xml_file, "xml") for record in soup.find_all(self.record_tag): self.importer.push_record(record) @@ -705,10 +711,12 @@ class Bs4XmlFileListPusher(RecordPusher): print(counts) return counts + class KafkaBs4XmlPusher(RecordPusher): """ Fetch XML for an article from Kafka, parse via Bs4. """ + def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): self.importer = importer self.consumer = make_kafka_consumer( @@ -716,10 +724,10 @@ class KafkaBs4XmlPusher(RecordPusher): kafka_env, topic_suffix, group, - kafka_namespace=kwargs.get('kafka_namespace', 'fatcat') + kafka_namespace=kwargs.get("kafka_namespace", "fatcat"), ) - self.poll_interval = kwargs.get('poll_interval', 5.0) - self.consume_batch_size = kwargs.get('consume_batch_size', 25) + self.poll_interval = kwargs.get("poll_interval", 5.0) + self.consume_batch_size = kwargs.get("consume_batch_size", 25) def run(self): count = 0 @@ -735,16 +743,19 @@ class KafkaBs4XmlPusher(RecordPusher): # outstanding editgroups every 5 minutes, but there is still that # window when editgroups might be hanging (unsubmitted). batch = self.consumer.consume( - num_messages=self.consume_batch_size, - timeout=self.poll_interval) - print("... got {} kafka messages ({}sec poll interval) {}".format( - len(batch), self.poll_interval, self.importer.counts)) + num_messages=self.consume_batch_size, timeout=self.poll_interval + ) + print( + "... got {} kafka messages ({}sec poll interval) {}".format( + len(batch), self.poll_interval, self.importer.counts + ) + ) if not batch: if datetime.datetime.now() - last_push > datetime.timedelta(minutes=5): # it has been some time, so flush any current editgroup self.importer.finish() last_push = datetime.datetime.now() - #print("Flushed any partial import batch: {}".format(self.importer.counts)) + # print("Flushed any partial import batch: {}".format(self.importer.counts)) continue # first check errors on entire batch... for msg in batch: @@ -752,7 +763,7 @@ class KafkaBs4XmlPusher(RecordPusher): raise KafkaException(msg.error()) # ... 
then process for msg in batch: - soup = BeautifulSoup(msg.value().decode('utf-8'), "xml") + soup = BeautifulSoup(msg.value().decode("utf-8"), "xml") self.importer.push_record(soup) soup.decompose() count += 1 @@ -771,8 +782,8 @@ class KafkaBs4XmlPusher(RecordPusher): self.consumer.close() return counts -class KafkaJsonPusher(RecordPusher): +class KafkaJsonPusher(RecordPusher): def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): self.importer = importer self.consumer = make_kafka_consumer( @@ -780,11 +791,11 @@ class KafkaJsonPusher(RecordPusher): kafka_env, topic_suffix, group, - kafka_namespace=kwargs.get('kafka_namespace', 'fatcat') + kafka_namespace=kwargs.get("kafka_namespace", "fatcat"), ) - self.poll_interval = kwargs.get('poll_interval', 5.0) - self.consume_batch_size = kwargs.get('consume_batch_size', 100) - self.force_flush = kwargs.get('force_flush', False) + self.poll_interval = kwargs.get("poll_interval", 5.0) + self.consume_batch_size = kwargs.get("consume_batch_size", 100) + self.force_flush = kwargs.get("force_flush", False) def run(self): count = 0 @@ -801,10 +812,13 @@ class KafkaJsonPusher(RecordPusher): # outstanding editgroups every 5 minutes, but there is still that # window when editgroups might be hanging (unsubmitted). batch = self.consumer.consume( - num_messages=self.consume_batch_size, - timeout=self.poll_interval) - print("... got {} kafka messages ({}sec poll interval) {}".format( - len(batch), self.poll_interval, self.importer.counts)) + num_messages=self.consume_batch_size, timeout=self.poll_interval + ) + print( + "... got {} kafka messages ({}sec poll interval) {}".format( + len(batch), self.poll_interval, self.importer.counts + ) + ) if self.force_flush: # this flushing happens even if there have been 'push' events # more recently. it is intended for, eg, importers off the @@ -821,7 +835,7 @@ class KafkaJsonPusher(RecordPusher): self.importer.finish() last_push = datetime.datetime.now() last_force_flush = datetime.datetime.now() - #print("Flushed any partial import batch: {}".format(self.importer.counts)) + # print("Flushed any partial import batch: {}".format(self.importer.counts)) continue # first check errors on entire batch... for msg in batch: @@ -829,7 +843,7 @@ class KafkaJsonPusher(RecordPusher): raise KafkaException(msg.error()) # ... then process for msg in batch: - record = json.loads(msg.value().decode('utf-8')) + record = json.loads(msg.value().decode("utf-8")) self.importer.push_record(record) count += 1 if count % 500 == 0: @@ -864,25 +878,25 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat print("Bailing out...") # TODO: should it be sys.exit(-1)? 
raise KafkaException(p.error) - #print("Kafka consumer commit successful") + # print("Kafka consumer commit successful") pass # previously, using pykafka - #auto_commit_enable=True, - #auto_commit_interval_ms=30000, # 30 seconds + # auto_commit_enable=True, + # auto_commit_interval_ms=30000, # 30 seconds conf = { - 'bootstrap.servers': hosts, - 'group.id': group, - 'on_commit': fail_fast, + "bootstrap.servers": hosts, + "group.id": group, + "on_commit": fail_fast, # messages don't have offset marked as stored until pushed to # elastic, but we do auto-commit stored offsets to broker - 'enable.auto.offset.store': False, - 'enable.auto.commit': True, + "enable.auto.offset.store": False, + "enable.auto.commit": True, # user code timeout; if no poll after this long, assume user code # hung and rebalance (default: 5min) - 'max.poll.interval.ms': 120000, - 'default.topic.config': { - 'auto.offset.reset': 'latest', + "max.poll.interval.ms": 120000, + "default.topic.config": { + "auto.offset.reset": "latest", }, } @@ -890,13 +904,13 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat for p in partitions: if p.error: raise KafkaException(p.error) - print("Kafka partitions rebalanced: {} / {}".format( - consumer, partitions)) + print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions)) consumer = Consumer(conf) # NOTE: it's actually important that topic_name *not* be bytes (UTF-8 # encoded) - consumer.subscribe([topic_name], + consumer.subscribe( + [topic_name], on_assign=on_rebalance, on_revoke=on_rebalance, ) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fd6936a4..606d4bb1 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,4 +1,3 @@ - import datetime import sqlite3 from typing import Any, Dict, Optional @@ -13,30 +12,30 @@ from .common import EntityImporter, clean # Can get a list of Crossref types (with counts) via API: # https://api.crossref.org/works?rows=0&facet=type-name:* CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = { - 'book': 'book', - 'book-chapter': 'chapter', - 'book-part': 'chapter', - 'book-section': 'chapter', - 'component': 'component', - 'dataset': 'dataset', - 'dissertation': 'thesis', - 'edited-book': 'book', - 'journal-article': 'article-journal', - 'monograph': 'book', - 'other': None, - 'peer-review': 'peer_review', - 'posted-content': 'post', - 'proceedings-article': 'paper-conference', - 'reference-book': 'book', - 'reference-entry': 'entry', - 'report': 'report', - 'standard': 'standard', + "book": "book", + "book-chapter": "chapter", + "book-part": "chapter", + "book-section": "chapter", + "component": "component", + "dataset": "dataset", + "dissertation": "thesis", + "edited-book": "book", + "journal-article": "article-journal", + "monograph": "book", + "other": None, + "peer-review": "peer_review", + "posted-content": "post", + "proceedings-article": "paper-conference", + "reference-book": "book", + "reference-entry": "entry", + "report": "report", + "standard": "standard", } CONTAINER_TYPE_MAP: Dict[str, str] = { - 'article-journal': 'journal', - 'paper-conference': 'conference', - 'book': 'book-series', + "article-journal": "journal", + "paper-conference": "conference", + "book": "book-series", } # These are based, informally, on sorting the most popular licenses found in @@ -90,29 +89,41 @@ LICENSE_SLUG_MAP: Dict[str, str] = { "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", } + def 
lookup_license_slug(raw: str) -> Optional[str]: if not raw: return None - raw = raw.strip().replace('http://', '//').replace('https://', '//') - if 'creativecommons.org' in raw.lower(): + raw = raw.strip().replace("http://", "//").replace("https://", "//") + if "creativecommons.org" in raw.lower(): raw = raw.lower() - raw = raw.replace('/legalcode', '/').replace('/uk', '') - if not raw.endswith('/'): - raw = raw + '/' + raw = raw.replace("/legalcode", "/").replace("/uk", "") + if not raw.endswith("/"): + raw = raw + "/" return LICENSE_SLUG_MAP.get(raw) + def test_lookup_license_slug(): assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" - assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY" - assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0" + assert ( + lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") + == "CC-BY" + ) + assert ( + lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") + == "CC-0" + ) assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" - assert lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA" + assert ( + lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") + == "CC-BY-NC-SA" + ) assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None assert lookup_license_slug("") is None assert lookup_license_slug(None) is None + class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. @@ -124,18 +135,22 @@ class CrossrefImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc: Optional[str] = kwargs.get('editgroup_description', - "Automated import of Crossref DOI metadata, harvested from REST API") - eg_extra: Optional[dict] = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter') - super().__init__(api, + eg_desc: Optional[str] = kwargs.get( + "editgroup_description", + "Automated import of Crossref DOI metadata, harvested from REST API", + ) + eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers: bool = kwargs.get('create_containers', True) - extid_map_file = kwargs.get('extid_map_file') + self.create_containers: bool = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db: Optional[Any] = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -148,12 +163,27 @@ class CrossrefImporter(EntityImporter): def lookup_ext_ids(self, doi: str) -> Optional[Any]: if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -173,17 +203,17 @@ class CrossrefImporter(EntityImporter): return CONTAINER_TYPE_MAP.get(crossref_type) def want(self, obj: Dict[str, Any]) -> bool: - if not obj.get('title'): - self.counts['skip-blank-title'] += 1 + if not obj.get("title"): + self.counts["skip-blank-title"] += 1 return False # these are pre-registered DOIs before the actual record is ready # title is a list of titles - titles = obj.get('title') + titles = obj.get("title") if titles is not None and titles[0].strip().lower() in [ - "OUP accepted manuscript".lower(), - ]: - self.counts['skip-stub-title'] += 1 + "OUP accepted manuscript".lower(), + ]: + self.counts["skip-stub-title"] += 1 return False # do most of these checks in-line below @@ -197,86 +227,105 @@ class CrossrefImporter(EntityImporter): # Ways to be out of scope (provisionally) # journal-issue and journal-volume map to None, but allowed for now - if obj.get('type') in (None, 'journal', 'proceedings', - 'standard-series', 'report-series', 'book-series', 'book-set', - 'book-track', 'proceedings-series'): - self.counts['skip-release-type'] += 1 + if obj.get("type") in ( + None, + "journal", + "proceedings", + "standard-series", + "report-series", + "book-series", + "book-set", + "book-track", + "proceedings-series", + ): + self.counts["skip-release-type"] += 1 return None # Do require the 'title' keys to exist, as release entities do - if ('title' not in obj) or (not obj['title']): - self.counts['skip-blank-title'] += 1 + if ("title" not in obj) or (not obj["title"]): + self.counts["skip-blank-title"] += 1 return None - release_type = self.map_release_type(obj['type']) + release_type = self.map_release_type(obj["type"]) # contribs def do_contribs(obj_list, ctype): contribs = [] for i, am in enumerate(obj_list): creator_id = None - if 'ORCID' in am.keys(): - creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) + if "ORCID" in am.keys(): + creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1]) # Sorry humans :( - if am.get('given') and am.get('family'): - raw_name = "{} {}".format(am['given'], am['family']) - elif am.get('family'): - raw_name = am['family'] + if am.get("given") and am.get("family"): + raw_name = "{} {}".format(am["given"], am["family"]) + elif am.get("family"): + raw_name = am["family"] else: # TODO: can end up empty - raw_name = am.get('name') or am.get('given') + raw_name = am.get("name") or am.get("given") extra = dict() if ctype == "author": index = i else: index = None raw_affiliation = None - if am.get('affiliation'): - if len(am.get('affiliation')) > 0: - raw_affiliation = am.get('affiliation')[0]['name'] - if len(am.get('affiliation')) > 1: + if am.get("affiliation"): + if len(am.get("affiliation")) > 0: + raw_affiliation = am.get("affiliation")[0]["name"] + if len(am.get("affiliation")) > 1: # note: affiliation => more_affiliations - extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] - if am.get('sequence') and am.get('sequence') != "additional": - extra['seq'] = clean(am.get('sequence')) + extra["more_affiliations"] = [ + clean(a["name"]) for a in am.get("affiliation")[1:] + ] + if 
am.get("sequence") and am.get("sequence") != "additional": + extra["seq"] = clean(am.get("sequence")) if not extra: extra = None assert ctype in ("author", "editor", "translator") raw_name = clean(raw_name) - contribs.append(fatcat_openapi_client.ReleaseContrib( - creator_id=creator_id, - index=index, - raw_name=raw_name, - given_name=clean(am.get('given')), - surname=clean(am.get('family')), - raw_affiliation=clean(raw_affiliation), - role=ctype, - extra=extra)) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=index, + raw_name=raw_name, + given_name=clean(am.get("given")), + surname=clean(am.get("family")), + raw_affiliation=clean(raw_affiliation), + role=ctype, + extra=extra, + ) + ) return contribs - contribs = do_contribs(obj.get('author', []), "author") - contribs.extend(do_contribs(obj.get('editor', []), "editor")) - contribs.extend(do_contribs(obj.get('translator', []), "translator")) + + contribs = do_contribs(obj.get("author", []), "author") + contribs.extend(do_contribs(obj.get("editor", []), "editor")) + contribs.extend(do_contribs(obj.get("translator", []), "translator")) # container - issn = obj.get('ISSN', [None])[0] + issn = obj.get("ISSN", [None])[0] issnl = self.issn2issnl(issn) container_id = None if issnl: container_id = self.lookup_issnl(issnl) - publisher = clean(obj.get('publisher')) + publisher = clean(obj.get("publisher")) - container_name = obj.get('container-title') + container_name = obj.get("container-title") if container_name: container_name = clean(container_name[0], force_xml=True) if not container_name: container_name = None - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=container_name) + name=container_name, + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -284,21 +333,21 @@ class CrossrefImporter(EntityImporter): # license slug license_slug = None license_extra = [] - for lic in obj.get('license', []): - if lic['content-version'] not in ('vor', 'unspecified'): + for lic in obj.get("license", []): + if lic["content-version"] not in ("vor", "unspecified"): continue - slug = lookup_license_slug(lic['URL']) + slug = lookup_license_slug(lic["URL"]) if slug: license_slug = slug - if 'start' in lic: - lic['start'] = lic['start']['date-time'] + if "start" in lic: + lic["start"] = lic["start"]["date-time"] license_extra.append(lic) # references refs = [] - for i, rm in enumerate(obj.get('reference', [])): + for i, rm in enumerate(obj.get("reference", [])): try: - year: Optional[int] = int(rm.get('year')) + year: Optional[int] = int(rm.get("year")) # TODO: will need to update/config in the future! # NOTE: are there crossref works with year < 100? 
if year is not None: @@ -307,56 +356,78 @@ class CrossrefImporter(EntityImporter): except (TypeError, ValueError): year = None ref_extra: Dict[str, Any] = dict() - key = rm.get('key') - if key and key.startswith(obj['DOI'].upper()): - key = key.replace(obj['DOI'].upper() + "-", '') - key = key.replace(obj['DOI'].upper(), '') - ref_container_name = rm.get('volume-title') + key = rm.get("key") + if key and key.startswith(obj["DOI"].upper()): + key = key.replace(obj["DOI"].upper() + "-", "") + key = key.replace(obj["DOI"].upper(), "") + ref_container_name = rm.get("volume-title") if not ref_container_name: - ref_container_name = rm.get('journal-title') - elif rm.get('journal-title'): - ref_extra['journal-title'] = rm['journal-title'] - if rm.get('DOI'): - ref_extra['doi'] = rm.get('DOI').lower() - author = clean(rm.get('author')) + ref_container_name = rm.get("journal-title") + elif rm.get("journal-title"): + ref_extra["journal-title"] = rm["journal-title"] + if rm.get("DOI"): + ref_extra["doi"] = rm.get("DOI").lower() + author = clean(rm.get("author")) if author: - ref_extra['authors'] = [author] - for k in ('editor', 'edition', 'authority', 'version', 'genre', - 'url', 'event', 'issue', 'volume', 'date', 'accessed_date', - 'issued', 'page', 'medium', 'collection_title', 'chapter_number', - 'unstructured', 'series-title', 'volume-title'): + ref_extra["authors"] = [author] + for k in ( + "editor", + "edition", + "authority", + "version", + "genre", + "url", + "event", + "issue", + "volume", + "date", + "accessed_date", + "issued", + "page", + "medium", + "collection_title", + "chapter_number", + "unstructured", + "series-title", + "volume-title", + ): if clean(rm.get(k)): ref_extra[k] = clean(rm[k]) if not ref_extra: ref_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - index=i, - # doing lookups would be a second import pass - target_release_id=None, - key=key, - year=year, - container_name=clean(ref_container_name), - title=clean(rm.get('article-title')), - locator=clean(rm.get('first-page')), - # TODO: just dump JSON somewhere here? - extra=ref_extra)) + refs.append( + fatcat_openapi_client.ReleaseRef( + index=i, + # doing lookups would be a second import pass + target_release_id=None, + key=key, + year=year, + container_name=clean(ref_container_name), + title=clean(rm.get("article-title")), + locator=clean(rm.get("first-page")), + # TODO: just dump JSON somewhere here? 
+ extra=ref_extra, + ) + ) # abstracts abstracts = [] - abstract = clean(obj.get('abstract')) + abstract = clean(obj.get("abstract")) if abstract and len(abstract) > 10: - abstracts.append(fatcat_openapi_client.ReleaseAbstract( - mimetype="application/xml+jats", - content=abstract)) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + mimetype="application/xml+jats", content=abstract + ) + ) # extra fields extra = dict() extra_crossref = dict() # top-level extra keys if not container_id: - if obj.get('container-title'): - extra['container_name'] = container_name - for key in ('group-title'): + if obj.get("container-title"): + extra["container_name"] = container_name + for key in "group-title": val = obj.get(key) if val: if type(val) == list: @@ -368,7 +439,7 @@ class CrossrefImporter(EntityImporter): else: extra[key] = val # crossref-nested extra keys - for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'): + for key in ("subject", "type", "alternative-id", "archive", "funder"): val = obj.get(key) if val: if type(val) == str: @@ -376,46 +447,51 @@ class CrossrefImporter(EntityImporter): else: extra_crossref[key] = val if license_extra: - extra_crossref['license'] = license_extra + extra_crossref["license"] = license_extra - if len(obj['title']) > 1: - aliases = [clean(t) for t in obj['title'][1:]] + if len(obj["title"]) > 1: + aliases = [clean(t) for t in obj["title"][1:]] aliases = [t for t in aliases if t] if aliases: - extra['aliases'] = aliases + extra["aliases"] = aliases # ISBN isbn13 = None - for raw in obj.get('ISBN', []): + for raw in obj.get("ISBN", []): # TODO: convert if not ISBN-13 format if len(raw) == 17: isbn13 = raw break # release status - if obj['type'] in ('journal-article', 'conference-proceeding', 'book', - 'dissertation', 'book-chapter'): + if obj["type"] in ( + "journal-article", + "conference-proceeding", + "book", + "dissertation", + "book-chapter", + ): release_stage = "published" else: # unknown release_stage = None # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj['DOI'].lower()) + extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) # filter out unreasonably huge releases if len(abstracts) > 100: - self.counts['skip-huge-abstracts'] += 1 + self.counts["skip-huge-abstracts"] += 1 return None if len(contribs) > 2000: - self.counts['skip-huge-contribs'] += 1 + self.counts["skip-huge-contribs"] += 1 return None if len(refs) > 5000: - self.counts['skip-huge-refs'] += 1 + self.counts["skip-huge-refs"] += 1 return None # release date parsing is amazingly complex - raw_date = obj['issued']['date-parts'][0] + raw_date = obj["issued"]["date-parts"][0] if not raw_date or not raw_date[0]: # got some NoneType, even though at least year is supposed to be set release_year = None @@ -429,28 +505,28 @@ class CrossrefImporter(EntityImporter): release_date = None original_title: Optional[str] = None - if obj.get('original-title'): - ot = obj.get('original-title') + if obj.get("original-title"): + ot = obj.get("original-title") if ot is not None: original_title = clean(ot[0], force_xml=True) title: Optional[str] = None - if obj.get('title'): - title = clean(obj.get('title')[0], force_xml=True) + if obj.get("title"): + title = clean(obj.get("title")[0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character - self.counts['skip-blank-title'] += 1 + self.counts["skip-blank-title"] += 1 return None subtitle = None - if obj.get('subtitle'): - subtitle = 
clean(obj.get('subtitle')[0], force_xml=True) + if obj.get("subtitle"): + subtitle = clean(obj.get("subtitle")[0], force_xml=True) if not subtitle or len(subtitle) <= 1: # subtitle can't be just a single character subtitle = None if extra_crossref: - extra['crossref'] = extra_crossref + extra["crossref"] = extra_crossref if not extra: extra = None @@ -466,19 +542,19 @@ class CrossrefImporter(EntityImporter): release_year=release_year, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=obj['DOI'].lower(), - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], + doi=obj["DOI"].lower(), + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], isbn13=isbn13, - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), - volume=clean(obj.get('volume')), - issue=clean(obj.get('issue')), - pages=clean(obj.get('page')), - language=clean(obj.get('language')), + volume=clean(obj.get("volume")), + issue=clean(obj.get("issue")), + pages=clean(obj.get("page")), + language=clean(obj.get("language")), license_slug=license_slug, extra=extra, abstracts=abstracts, @@ -500,14 +576,17 @@ class CrossrefImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index a06c68a4..4c174b0b 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -30,126 +30,130 @@ MAX_ABSTRACT_LENGTH = 2048 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary CONTAINER_TYPE_MAP = { - 'Journal': 'journal', - 'Series': 'journal', - 'Book Series': 'book-series', + "Journal": "journal", + "Series": "journal", + "Book Series": "book-series", } # The docs/guide should be the canonical home for these mappings; update there # first. Map various datacite type types to CSL-ish types. None means TODO or # remove. 
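An illustrative sketch of how a mapping like the DATACITE_TYPE_MAP defined just below is typically consulted, mirroring the priority loop used later in this importer (for typeType in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral")); the record here is hypothetical:

# Sketch only: cascade over the type systems in priority order, using the
# DATACITE_TYPE_MAP defined just below; "hypothetical_types" is an invented record.
hypothetical_types = {"citeproc": None, "ris": "RPRT", "resourceTypeGeneral": "Text"}
release_type = None
for type_system in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"):
    release_type = DATACITE_TYPE_MAP.get(type_system, {}).get(hypothetical_types.get(type_system))
    if release_type is not None:
        break
# release_type is now "report" (from the RIS "RPRT" entry), since citeproc gave no value.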
DATACITE_TYPE_MAP = { - 'ris': { - 'THES': 'thesis', - 'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report) - 'CHAP': 'chapter', - 'FIGURE': 'figure', - 'RPRT': 'report', - 'JOUR': 'article-journal', - 'MPCT': 'motion_picture', - 'GEN': 'article-journal', # GEN consist of 99% article and report, post-weblog, misc - and one dataset - 'BOOK': 'book', - 'DATA': 'dataset', - 'COMP': 'software', + "ris": { + "THES": "thesis", + "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report) + "CHAP": "chapter", + "FIGURE": "figure", + "RPRT": "report", + "JOUR": "article-journal", + "MPCT": "motion_picture", + "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset + "BOOK": "book", + "DATA": "dataset", + "COMP": "software", }, - 'schemaOrg': { - 'Dataset': 'dataset', - 'Book': 'book', - 'ScholarlyArticle': 'article-journal', - 'ImageObject': 'graphic', - 'Collection': None, - 'MediaObject': None, - 'Event': None, - 'SoftwareSourceCode': 'software', - 'Chapter': 'chapter', - 'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. - 'PublicationIssue': 'article', - 'AudioObject': None, - 'Thesis': 'thesis', + "schemaOrg": { + "Dataset": "dataset", + "Book": "book", + "ScholarlyArticle": "article-journal", + "ImageObject": "graphic", + "Collection": None, + "MediaObject": None, + "Event": None, + "SoftwareSourceCode": "software", + "Chapter": "chapter", + "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. + "PublicationIssue": "article", + "AudioObject": None, + "Thesis": "thesis", }, - 'citeproc': { - 'article': 'article', - 'article-journal': 'article-journal', - 'article-magazine': 'article-magazine', - 'article-newspaper': 'article-newspaper', - 'bill': 'bill', - 'book': 'book', - 'broadcast': 'broadcast', - 'chapter': 'chapter', - 'dataset': 'dataset', - 'entry-dictionary': 'entry-dictionary', - 'entry-encyclopedia': 'entry-encyclopedia', - 'entry': 'entry', - 'figure': 'figure', - 'graphic': 'graphic', - 'interview': 'interview', - 'legal_case': 'legal_case', - 'legislation': 'legislation', - 'manuscript': 'manuscript', - 'map': 'map', - 'motion_picture': 'motion_picture', - 'musical_score': 'musical_score', - 'pamphlet': 'pamphlet', - 'paper-conference': 'paper-conference', - 'patent': 'patent', - 'personal_communication': 'personal_communication', - 'post': 'post', - 'post-weblog': 'post-weblog', - 'report': 'report', - 'review-book': 'review-book', - 'review': 'review', - 'song': 'song', - 'speech': 'speech', - 'thesis': 'thesis', - 'treaty': 'treaty', - 'webpage': 'webpage', + "citeproc": { + "article": "article", + "article-journal": "article-journal", + "article-magazine": "article-magazine", + "article-newspaper": "article-newspaper", + "bill": "bill", + "book": "book", + "broadcast": "broadcast", + "chapter": "chapter", + "dataset": "dataset", + "entry-dictionary": "entry-dictionary", + "entry-encyclopedia": "entry-encyclopedia", + "entry": "entry", + "figure": "figure", + "graphic": "graphic", + "interview": "interview", + "legal_case": "legal_case", + "legislation": "legislation", + "manuscript": "manuscript", + "map": "map", + "motion_picture": "motion_picture", + "musical_score": "musical_score", + "pamphlet": "pamphlet", + "paper-conference": "paper-conference", + "patent": "patent", + "personal_communication": "personal_communication", + "post": "post", + "post-weblog": 
"post-weblog", + "report": "report", + "review-book": "review-book", + "review": "review", + "song": "song", + "speech": "speech", + "thesis": "thesis", + "treaty": "treaty", + "webpage": "webpage", }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types - 'bibtex': { - 'phdthesis': 'thesis', - 'inbook': 'chapter', - 'misc': None, - 'article': 'article-journal', - 'book': 'book', + "bibtex": { + "phdthesis": "thesis", + "inbook": "chapter", + "misc": None, + "article": "article-journal", + "book": "book", }, - 'resourceTypeGeneral': { - 'Image': 'graphic', - 'Dataset': 'dataset', - 'PhysicalObject': None, - 'Collection': None, - 'Text': None, # "Greyliterature, labnotes, accompanyingmaterials" - 'Sound': None, - 'InteractiveResource': None, - 'Event': None, - 'Software': 'software', - 'Other': None, - 'Workflow': None, - 'Audiovisual': None, - } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 + "resourceTypeGeneral": { + "Image": "graphic", + "Dataset": "dataset", + "PhysicalObject": None, + "Collection": None, + "Text": None, # "Greyliterature, labnotes, accompanyingmaterials" + "Sound": None, + "InteractiveResource": None, + "Event": None, + "Software": "software", + "Other": None, + "Workflow": None, + "Audiovisual": None, + }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 } # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. DATACITE_UNKNOWN_MARKERS = ( - '(:unac)', # temporarily inaccessible - '(:unal)', # unallowed, suppressed intentionally - '(:unap)', # not applicable, makes no sense - '(:unas)', # value unassigned (e.g., Untitled) - '(:unav)', # value unavailable, possibly unknown - '(:unkn)', # known to be unknown (e.g., Anonymous, Inconnue) - '(:none)', # never had a value, never will - '(:null)', # explicitly and meaningfully empty - '(:tba)', # to be assigned or announced later - '(:etal)', # too numerous to list (et alia) + "(:unac)", # temporarily inaccessible + "(:unal)", # unallowed, suppressed intentionally + "(:unap)", # not applicable, makes no sense + "(:unas)", # value unassigned (e.g., Untitled) + "(:unav)", # value unavailable, possibly unknown + "(:unkn)", # known to be unknown (e.g., Anonymous, Inconnue) + "(:none)", # never had a value, never will + "(:null)", # explicitly and meaningfully empty + "(:tba)", # to be assigned or announced later + "(:etal)", # too numerous to list (et alia) ) # UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking # unknown values. -UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( - 'NA', - 'NN', - 'n.a.', - '[s.n.]', - 'Unknown', -))) +UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union( + set( + ( + "NA", + "NN", + "n.a.", + "[s.n.]", + "Unknown", + ) + ) +) # UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist. 
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) @@ -157,8 +161,20 @@ UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi DATACITE_TITLE_SPAM_WORDGROUPS = [ { - "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online', - 'free', 'hd', 'download', 'english', 'subtitle', 'bluray'), + "tokens": ( + "full", + "movies", + "movie", + "watch", + "streaming", + "online", + "free", + "hd", + "download", + "english", + "subtitle", + "bluray", + ), "min": 4, } ] @@ -205,28 +221,25 @@ class DataciteImporter(EntityImporter): """ Importer for datacite records. """ - def __init__(self, - api, - issn_map_file, - debug=False, - insert_log_file=None, - **kwargs): + + def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of Datacite DOI metadata, harvested from REST API" + "editgroup_description", + "Automated import of Datacite DOI metadata, harvested from REST API", ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DataciteImporter') - super().__init__(api, - issn_map_file=issn_map_file, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) - - self.create_containers = kwargs.get('create_containers', True) - extid_map_file = kwargs.get('extid_map_file') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DataciteImporter") + super().__init__( + api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs + ) + + self.create_containers = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -240,30 +253,34 @@ class DataciteImporter(EntityImporter): self.insert_log_file = insert_log_file self.this_year = datetime.datetime.now().year - print('datacite with debug={}'.format(self.debug), file=sys.stderr) + print("datacite with debug={}".format(self.debug), file=sys.stderr) def lookup_ext_ids(self, doi): """ Return dictionary of identifiers referring to the same things as the given DOI. """ if self.extid_map_db is None: - return dict(core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None) + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -280,22 +297,22 @@ class DataciteImporter(EntityImporter): """ if not obj or not isinstance(obj, dict): return None - if 'attributes' not in obj: + if "attributes" not in obj: return None - attributes = obj['attributes'] - doi = clean_doi(attributes.get('doi', '').lower()) + attributes = obj["attributes"] + doi = clean_doi(attributes.get("doi", "").lower()) if not doi: - print('skipping record without a DOI', file=sys.stderr) + print("skipping record without a DOI", file=sys.stderr) return if not str.isascii(doi): - print('[{}] skipping non-ascii doi for now'.format(doi)) + print("[{}] skipping non-ascii doi for now".format(doi)) return None - creators = attributes.get('creators', []) or [] - contributors = attributes.get('contributors', []) or [] # Much fewer than creators. + creators = attributes.get("creators", []) or [] + contributors = attributes.get("contributors", []) or [] # Much fewer than creators. contribs = self.parse_datacite_creators(creators, doi=doi) @@ -323,7 +340,9 @@ class DataciteImporter(EntityImporter): # Related: https://guide.fatcat.wiki/entity_release.html -- role # (string, of a set): the type of contribution, from a controlled # vocabulary. TODO: vocabulary needs review. - contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) + contribs_extra_contributors = self.parse_datacite_creators( + contributors, set_index=False, doi=doi + ) # Unfortunately, creators and contributors might overlap, refs GH59. for cc in contribs_extra_contributors: @@ -333,17 +352,16 @@ class DataciteImporter(EntityImporter): # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" - titles = attributes.get('titles', []) or [] - title, original_language_title, subtitle = parse_datacite_titles( - titles) + titles = attributes.get("titles", []) or [] + title, original_language_title, subtitle = parse_datacite_titles(titles) if title is None: - print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False title = clean(title) if not title: - print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False # check for blocklisted "spam", e.g. "FULL MOVIE" @@ -367,10 +385,13 @@ class DataciteImporter(EntityImporter): # "Collected", "Copyrighted", "Created", "Issued", "Submitted", # "Updated", "Valid". release_date, release_month, release_year = parse_datacite_dates( - attributes.get('dates', [])) + attributes.get("dates", []) + ) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_date = None release_month = None release_year = None @@ -378,26 +399,30 @@ class DataciteImporter(EntityImporter): # Some records do not use the "dates" field (e.g. 
micropub), but: # "attributes.published" or "attributes.publicationYear" if not any((release_date, release_month, release_year)): - release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) + release_date, release_month, release_year = parse_single_date( + attributes.get("publicationYear") + ) if not any((release_date, release_month, release_year)): - release_date, release_month, release_year = parse_single_date(attributes.get('published')) + release_date, release_month, release_year = parse_single_date( + attributes.get("published") + ) if not any((release_date, release_month, release_year)): - print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr) + print("[{}] record w/o date: {}".format(doi, obj), file=sys.stderr) # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". - release_stage = 'published' + release_stage = "published" # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, # we might want something else than 'published'. See also: # https://support.datacite.org/docs/doi-states. # Publisher. A few NA values. A few bogus values. - publisher = attributes.get('publisher') + publisher = attributes.get("publisher") - if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): + if publisher in UNKNOWN_MARKERS | set(("Unpublished", "Unknown")): publisher = None release_stage = None if publisher is not None and len(publisher) > 80: @@ -416,24 +441,26 @@ class DataciteImporter(EntityImporter): container_id = None container_name = None - container = attributes.get('container', {}) or {} - if container.get('type') in CONTAINER_TYPE_MAP.keys(): - container_type = CONTAINER_TYPE_MAP.get(container['type']) - if container.get('identifier') and container.get( - 'identifierType') == 'ISSN': - issn = container.get('identifier') + container = attributes.get("container", {}) or {} + if container.get("type") in CONTAINER_TYPE_MAP.keys(): + container_type = CONTAINER_TYPE_MAP.get(container["type"]) + if container.get("identifier") and container.get("identifierType") == "ISSN": + issn = container.get("identifier") if len(issn) == 8: issn = issn[:4] + "-" + issn[4:] issnl = self.issn2issnl(issn) if issnl is not None: container_id = self.lookup_issnl(issnl) - if container_id is None and container.get('title'): - container_name = container.get('title') + if container_id is None and container.get("title"): + container_name = container.get("title") if isinstance(container_name, list): if len(container_name) > 0: - print('[{}] too many container titles: {}'.format(doi, - len(container_name))) + print( + "[{}] too many container titles: {}".format( + doi, len(container_name) + ) + ) container_name = container_name[0] assert isinstance(container_name, str) ce = fatcat_openapi_client.ContainerEntity( @@ -447,21 +474,24 @@ class DataciteImporter(EntityImporter): else: # TODO(martin): factor this out into a testable function. # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013 - container_name = container.get('title') + container_name = container.get("title") if isinstance(container_name, list): if len(container_name) > 0: - print('[{}] too many container titles: {}'.format(doi, - len(container_name))) + print( + "[{}] too many container titles: {}".format( + doi, len(container_name) + ) + ) container_name = container_name[0] # Exception: https://www.micropublication.org/, see: !MR24. 
if container_id is None and container_name is None: - if publisher and publisher.lower().startswith('micropublication'): + if publisher and publisher.lower().startswith("micropublication"): container_name = publisher # Volume and issue. - volume = container.get('volume') - issue = container.get('issue') + volume = container.get("volume") + issue = container.get("issue") if volume: volume = clean(volume) @@ -472,13 +502,13 @@ class DataciteImporter(EntityImporter): # Pages. pages = None - first_page = container.get('firstPage') - last_page = container.get('lastPage') + first_page = container.get("firstPage") + last_page = container.get("lastPage") if first_page and last_page: try: _ = int(first_page) < int(last_page) - pages = '{}-{}'.format(first_page, last_page) + pages = "{}-{}".format(first_page, last_page) except ValueError as err: # noqa: F841 # TODO(martin): This is more debug than info. # print('[{}] {}'.format(doi, err), file=sys.stderr) @@ -491,8 +521,8 @@ class DataciteImporter(EntityImporter): license_slug = None license_extra = [] - for lic in attributes.get('rightsList', []): - slug = lookup_license_slug(lic.get('rightsUri')) + for lic in attributes.get("rightsList", []): + slug = lookup_license_slug(lic.get("rightsUri")) if slug: license_slug = slug license_extra.append(lic) @@ -506,7 +536,7 @@ class DataciteImporter(EntityImporter): # library solves it for you." -- TODO(martin): We need more of these. language = None - value = attributes.get('language', '') or '' + value = attributes.get("language", "") or "" try: language = pycountry.languages.lookup(value).alpha_2 except (LookupError, AttributeError) as err: # noqa: F841 @@ -520,22 +550,22 @@ class DataciteImporter(EntityImporter): # "Other" fields might contain references or related articles (with # DOI). TODO(martin): maybe try to parse out some of those refs. abstracts = [] - descs = attributes.get('descriptions', []) or [] + descs = attributes.get("descriptions", []) or [] for desc in descs: - if not desc.get('descriptionType') == 'Abstract': + if not desc.get("descriptionType") == "Abstract": continue # Description maybe a string, int or list. - text = desc.get('description', '') + text = desc.get("description", "") if not text: continue if isinstance(text, int): - text = '{}'.format(text) + text = "{}".format(text) if isinstance(text, list): try: text = "\n".join(text) except TypeError: - continue # Bail out, if it is not a list of strings. + continue # Bail out, if it is not a list of strings. # Limit length. if len(text) < 10: @@ -548,7 +578,10 @@ class DataciteImporter(EntityImporter): try: lang = langdetect.detect(text) except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: - print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) + print( + "[{}] language detection failed with {} on {}".format(doi, err, text), + file=sys.stderr, + ) abstract_text = clean(text) if not abstract_text: continue @@ -557,7 +590,8 @@ class DataciteImporter(EntityImporter): mimetype="text/plain", content=abstract_text, lang=lang, - )) + ) + ) # References and relations. Datacite include many relation types in # "attributes.relatedIdentifiers[].relationType", e.g. @@ -570,67 +604,76 @@ class DataciteImporter(EntityImporter): # For the moment, we only care about References. 
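The loop that follows keeps only "References"/"Cites" relations and records a DOI, when present, in the ref extra field. A hypothetical relatedIdentifiers entry of the shape that loop accepts:

# Hypothetical relatedIdentifiers entry; field names match the parsing code below,
# the DOI value is invented.
related = {
    "relationType": "References",
    "relatedIdentifierType": "DOI",
    "relatedIdentifier": "10.1234/example.5678",
}
# The loop below would turn this into ReleaseRef(index=0, extra={"doi": "10.1234/example.5678"}).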
refs, ref_index = [], 0 - relIds = attributes.get('relatedIdentifiers', []) or [] + relIds = attributes.get("relatedIdentifiers", []) or [] for rel in relIds: - if not rel.get('relationType', '') in ('References', 'Cites'): + if not rel.get("relationType", "") in ("References", "Cites"): continue ref_extra = dict() - if rel.get('relatedIdentifierType', '') == 'DOI': - ref_extra['doi'] = rel.get('relatedIdentifier') + if rel.get("relatedIdentifierType", "") == "DOI": + ref_extra["doi"] = rel.get("relatedIdentifier") if not ref_extra: ref_extra = None refs.append( fatcat_openapi_client.ReleaseRef( index=ref_index, extra=ref_extra, - )) + ) + ) ref_index += 1 # More specific release_type via 'Reviews' relationsship. for rel in relIds: - if rel.get('relatedIdentifierType', '') != 'Reviews': + if rel.get("relatedIdentifierType", "") != "Reviews": continue - release_type = 'review' + release_type = "review" # Extra information. extra_datacite = dict() if license_extra: - extra_datacite['license'] = license_extra - if attributes.get('subjects'): - extra_datacite['subjects'] = attributes['subjects'] + extra_datacite["license"] = license_extra + if attributes.get("subjects"): + extra_datacite["subjects"] = attributes["subjects"] # Include version information. - metadata_version = attributes.get('metadataVersion') or '' + metadata_version = attributes.get("metadataVersion") or "" if metadata_version: - extra_datacite['metadataVersion'] = metadata_version + extra_datacite["metadataVersion"] = metadata_version # Include resource types. - types = attributes.get('types', {}) or {} - resource_type = types.get('resourceType', '') or '' - resource_type_general = types.get('resourceTypeGeneral', '') or '' + types = attributes.get("types", {}) or {} + resource_type = types.get("resourceType", "") or "" + resource_type_general = types.get("resourceTypeGeneral", "") or "" if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER: - extra_datacite['resourceType'] = resource_type + extra_datacite["resourceType"] = resource_type if resource_type_general and resource_type_general.lower() not in UNKNOWN_MARKERS_LOWER: - extra_datacite['resourceTypeGeneral'] = resource_type_general + extra_datacite["resourceTypeGeneral"] = resource_type_general # Include certain relations from relatedIdentifiers. Keeping the # original structure of data here, which is a list of dicts, with # relation type, identifier and identifier type (mostly). 
relations = [] for rel in relIds: - if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues', - 'IsVariantFormOf', 'IsSupplementTo', - 'HasVersion', 'IsMetadataFor', - 'IsNewVersionOf', 'IsIdenticalTo', - 'IsVersionOf', 'IsDerivedFrom', - 'IsSourceOf'): + if rel.get("relationType") in ( + "IsPartOf", + "Reviews", + "Continues", + "IsVariantFormOf", + "IsSupplementTo", + "HasVersion", + "IsMetadataFor", + "IsNewVersionOf", + "IsIdenticalTo", + "IsVersionOf", + "IsDerivedFrom", + "IsSourceOf", + ): relations.append(rel) if relations: - extra_datacite['relations'] = relations + extra_datacite["relations"] = relations extra = dict() @@ -640,18 +683,18 @@ class DataciteImporter(EntityImporter): # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null, # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0", # "10161", "10010691", "10780", # "Presentación" - version = attributes.get('version') or None + version = attributes.get("version") or None # top-level extra keys if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name # Always include datacite key, even if value is empty (dict). - extra['datacite'] = extra_datacite + extra["datacite"] = extra_datacite # Preparation for a schema update. if release_month: - extra['release_month'] = release_month + extra["release_month"] = release_month extids = self.lookup_ext_ids(doi=doi) @@ -669,12 +712,12 @@ class DataciteImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), contribs=contribs, volume=volume, @@ -702,19 +745,19 @@ class DataciteImporter(EntityImporter): """ release_type = None - if not attributes.get('types'): + if not attributes.get("types"): return None - types = attributes['types'] + types = attributes["types"] - for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): + for typeType in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"): value = types.get(typeType) release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) if release_type is not None: break # special case: figshare "collections" which group other entities - if doi.startswith('10.6084/') or doi.startswith('10.25384'): - if types.get('resourceType') == "Collection": + if doi.startswith("10.6084/") or doi.startswith("10.25384"): + if types.get("resourceType") == "Collection": release_type = "stub" if release_type is None: @@ -736,35 +779,41 @@ class DataciteImporter(EntityImporter): # publishes highly interesting datasets, but titles are mostly the same # ("GBIF Occurrence Download" or "Occurrence Download"); set # release_type to "stub" (CSL/FC). 
- if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'): - re.release_type = 'stub' + if re.title == "GBIF Occurrence Download" and re.ext_ids.doi.startswith("10.15468/dl."): + re.release_type = "stub" # release_type exception: lots of "Experimental Crystal Structure Determination" # publisher: "Cambridge Crystallographic Data Centre" - if re.ext_ids.doi.startswith('10.5517/'): - re.release_type = 'entry' + if re.ext_ids.doi.startswith("10.5517/"): + re.release_type = "entry" # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." - if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'): - re.release_type = 'component' + if re.title.lower().startswith("additional file") and re.release_type in ( + "article", + "article-journal", + ): + re.release_type = "component" # figshare - if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'): + if re.ext_ids.doi.startswith("10.6084/") or re.ext_ids.doi.startswith("10.25384"): # set version if DOI ends with versioned suffix - doi_suffix = re.ext_ids.doi.split('.')[-1] - if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit(): + doi_suffix = re.ext_ids.doi.split(".")[-1] + if doi_suffix and doi_suffix.startswith("v") and doi_suffix[1:].isdigit(): re.version = doi_suffix # "Figure 123 from " -> component # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean" - if " from " in re.title and re.release_type not in ('stub', 'graphic'): + if " from " in re.title and re.release_type not in ("stub", "graphic"): if re.title.startswith("Figure "): re.release_type = "component" elif re.title.startswith("Table "): re.release_type = "component" # figshare.com - if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None: - re.extra['container_name'] = "figshare.com" + if ( + re.ext_ids.doi.startswith("10.6084/m9.figshare.") + and re.extra.get("container_name") is None + ): + re.extra["container_name"] = "figshare.com" return re @@ -788,26 +837,28 @@ class DataciteImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - print('inserting batch ({})'.format(len(batch)), file=sys.stderr) + print("inserting batch ({})".format(len(batch)), file=sys.stderr) if self.insert_log_file: - with open(self.insert_log_file, 'a') as f: + with open(self.insert_log_file, "a") as f: for doc in batch: json.dump(entity_to_dict(doc, api_client=None), f) - f.write('\n') + f.write("\n") self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) - def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None): + def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None): """ Parses a list of creators into a list of ReleaseContrib objects. Set set_index to False, if the index contrib field should be left blank. 
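For orientation, the hunk that follows parses DataCite creator records into ReleaseContrib objects. A hypothetical creator entry of the shape it expects, limited to the fields the parser actually reads:

# Hypothetical DataCite creator record, shaped like the entries parsed below.
creator = {
    "nameType": "Personal",
    "givenName": "Jane",
    "familyName": "Doe",
    "nameIdentifiers": [
        {
            "nameIdentifierScheme": "ORCID",
            "nameIdentifier": "https://orcid.org/0000-0002-1825-0097",
        },
    ],
    "affiliation": ["Example University"],
}
# With no "name" key, the parser falls back to "givenName familyName" ("Jane Doe"),
# keeps only the first affiliation string, and strips the "https://orcid.org/"
# prefix before looking up a creator_id via ORCID.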
@@ -820,48 +871,53 @@ class DataciteImporter(EntityImporter): contribs = [] # Names, that should be ignored right away. - name_blocklist = set(('Occdownload Gbif.Org',)) + name_blocklist = set(("Occdownload Gbif.Org",)) i = 0 for c in creators: if not set_index: i = None - nameType = c.get('nameType', '') or '' - if nameType in ('', 'Personal'): + nameType = c.get("nameType", "") or "" + if nameType in ("", "Personal"): creator_id = None - for nid in c.get('nameIdentifiers', []) or []: + for nid in c.get("nameIdentifiers", []) or []: if not isinstance(nid, dict): # see: fatcat-workers/issues/44035/ - print('unexpected nameIdentifiers, expected list of dicts, got: {}'.format(nid), file=sys.stderr) + print( + "unexpected nameIdentifiers, expected list of dicts, got: {}".format( + nid + ), + file=sys.stderr, + ) continue - name_scheme = nid.get('nameIdentifierScheme', '') or '' + name_scheme = nid.get("nameIdentifierScheme", "") or "" if not name_scheme.lower() == "orcid": continue - orcid = nid.get('nameIdentifier') or '' - orcid = orcid.replace('https://orcid.org/', '') + orcid = nid.get("nameIdentifier") or "" + orcid = orcid.replace("https://orcid.org/", "") if not orcid: continue creator_id = self.lookup_orcid(orcid) # TODO(martin): If creator_id is None, should we create creators? # If there are multiple affiliation strings, use the first one. - affiliations = c.get('affiliation', []) or [] + affiliations = c.get("affiliation", []) or [] raw_affiliation = None if len(affiliations) == 0: raw_affiliation = None else: raw_affiliation = clean(affiliations[0]) - name = c.get('name') - given_name = c.get('givenName') - surname = c.get('familyName') + name = c.get("name") + given_name = c.get("givenName") + surname = c.get("familyName") if name: name = clean(name) if not any((name, given_name, surname)): continue if not name: - name = "{} {}".format(given_name or '', surname or '').strip() + name = "{} {}".format(given_name or "", surname or "").strip() if name in name_blocklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -881,7 +937,7 @@ class DataciteImporter(EntityImporter): if not name: continue - if raw_affiliation == '': + if raw_affiliation == "": continue extra = None @@ -891,39 +947,38 @@ class DataciteImporter(EntityImporter): # "RelatedPerson", "ProjectLeader", "Editor", "Other", # "ProjectMember", "Funder", "RightsHolder", "DataCollector", # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup" - contributorType = c.get('contributorType', '') or '' + contributorType = c.get("contributorType", "") or "" if contributorType: - extra = {'type': contributorType} + extra = {"type": contributorType} rc = fatcat_openapi_client.ReleaseContrib( - creator_id=creator_id, - index=i, - raw_name=name, - given_name=given_name, - surname=surname, - role=role, - raw_affiliation=raw_affiliation, - extra=extra, - ) + creator_id=creator_id, + index=i, + raw_name=name, + given_name=given_name, + surname=surname, + role=role, + raw_affiliation=raw_affiliation, + extra=extra, + ) # Filter out duplicates early. 
if not contributor_list_contains_contributor(contribs, rc): contribs.append(rc) if i is not None: i += 1 - elif nameType == 'Organizational': - name = c.get('name', '') or '' + elif nameType == "Organizational": + name = c.get("name", "") or "" if name in UNKNOWN_MARKERS: continue if len(name) < 3: continue - extra = {'organization': name} - contribs.append(fatcat_openapi_client.ReleaseContrib( - index=i, extra=extra)) + extra = {"organization": name} + contribs.append(fatcat_openapi_client.ReleaseContrib(index=i, extra=extra)) if i is not None: i += 1 else: - print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) + print("[{}] unknown name type: {}".format(doi, nameType), file=sys.stderr) return contribs @@ -935,8 +990,8 @@ def contributor_list_contains_contributor(contributor_list, contributor): for cc in contributor_list: if cc.raw_name != contributor.raw_name: continue - cc_role = cc.role or 'author' - contributor_role = contributor.role or 'author' + cc_role = cc.role or "author" + contributor_role = contributor.role or "author" if cc_role != contributor_role: continue return True @@ -952,91 +1007,97 @@ def lookup_license_slug(raw): if not raw: return None - if 'creativecommons.org/publicdomain/zero' in raw: - return 'CC-0' - if raw.lower().endswith('/cc0'): - return 'CC-0' + if "creativecommons.org/publicdomain/zero" in raw: + return "CC-0" + if raw.lower().endswith("/cc0"): + return "CC-0" - if 'creativecommons' in raw: + if "creativecommons" in raw: # https://creativecommons.org/publicdomain/mark/1.0/deed.de - if 'creativecommons.org/publicdomain' in raw: - return 'CC-PUBLICDOMAIN' - if 'creativecommons.org/share-your-work/public-domain/cc0' in raw: - return 'CC-0' + if "creativecommons.org/publicdomain" in raw: + return "CC-PUBLICDOMAIN" + if "creativecommons.org/share-your-work/public-domain/cc0" in raw: + return "CC-0" # https://creativecommons.org/licenses/by/4.0/deed.es_ES raw = raw.lower() - match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE) + match = re.search( + r"creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)", raw, re.IGNORECASE + ) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None - if not name.startswith('cc'): - name = 'cc-{}'.format(name) + if not name.startswith("cc"): + name = "cc-{}".format(name) return name.upper() - if 'opensource.org' in raw: + if "opensource.org" in raw: # https://opensource.org/licenses/alphabetical, e.g. 
opensource.org/licenses/EUPL-1.2 - match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE) + match = re.search(r"opensource.org/licenses/(?P<name>[^/]+)", raw, re.IGNORECASE) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 11: return None return name.upper() - if 'gnu.org' in raw: + if "gnu.org" in raw: # http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html - match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE) + match = re.search( + r"/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)", + raw, + re.IGNORECASE, + ) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 8: return None return name.upper() - if 'spdx.org' in raw: - if 'spdx.org/licenses/CC0' in raw: - return 'CC-0' + if "spdx.org" in raw: + if "spdx.org/licenses/CC0" in raw: + return "CC-0" # https://spdx.org/licenses/CC-BY-NC-ND-4.0.html - match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE) + match = re.search(r"spdx.org/licenses/(?P<name>[a-z0-9-]+)", raw, re.IGNORECASE) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 36: return None # cleanup version and extensions - name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower()) + name = re.sub("(-[0-9])?[.]?[0-9]?(.json|.html)?", "", name.lower()) return name.upper() - if 'rightsstatements.org' in raw: + if "rightsstatements.org" in raw: # http://rightsstatements.org/vocab/InC/1.0/ - match = re.search(r'rightsstatements.org/(vocab|page)/(?P<name>[^/]*)', raw) + match = re.search(r"rightsstatements.org/(vocab|page)/(?P<name>[^/]*)", raw) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 9: return None - return 'RS-{}'.format(name.upper()) + return "RS-{}".format(name.upper()) # Fallback to mapped values. 
raw = raw.lower() - raw = raw.strip().replace('http://', '//').replace('https://', '//') - if not raw.endswith('/'): - raw = raw + '/' + raw = raw.strip().replace("http://", "//").replace("https://", "//") + if not raw.endswith("/"): + raw = raw + "/" return LICENSE_SLUG_MAP.get(raw) @@ -1046,23 +1107,21 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3): Example input: {'title': 'Some title', 'original_language_title': 'Some title'} """ - if 'original_language_title' not in item: + if "original_language_title" not in item: return None - title = item.get('title') + title = item.get("title") if not title: return None - original_language_title = item.get('original_language_title') - if isinstance(original_language_title, - str) and title != original_language_title: + original_language_title = item.get("original_language_title") + if isinstance(original_language_title, str) and title != original_language_title: if len(original_language_title) < min_length: return None - if original_language_title.count('?') > max_questionmarks: + if original_language_title.count("?") > max_questionmarks: return None return original_language_title if isinstance(original_language_title, dict): - content = original_language_title.get('__content__', '') or '' - if content and content != title and not content.count( - '?') > max_questionmarks: + content = original_language_title.get("__content__", "") or "" + if content and content != title and not content.count("?") > max_questionmarks: return content return None @@ -1082,23 +1141,23 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle elif len(titles) == 1: original_language_title = find_original_language_title(titles[0]) - title = titles[0].get('title', '') or '' + title = titles[0].get("title", "") or "" title = title.strip() if not title: title = None return title, original_language_title, subtitle else: for entry in titles: - if not title and ('titleType' not in entry - or not entry.get('titleType')): - title = (entry.get('title') or '').strip() - if not subtitle and entry.get('titleType') == 'Subtitle': - subtitle = entry.get('title', '').strip() + if not title and ("titleType" not in entry or not entry.get("titleType")): + title = (entry.get("title") or "").strip() + if not subtitle and entry.get("titleType") == "Subtitle": + subtitle = entry.get("title", "").strip() if not original_language_title: original_language_title = find_original_language_title(entry) return title, original_language_title, subtitle + def parse_single_date(value): """ Given a single string containing a date in arbitrary format, try to return @@ -1113,11 +1172,11 @@ def parse_single_date(value): # Results in a dict with keys: date_obj, period, locale. parse_result = parser.get_date_data(value) # A datetime object, later we need a date, only. 
- result = parse_result['date_obj'] + result = parse_result["date_obj"] if result is not None: - if parse_result['period'] == 'year': + if parse_result["period"] == "year": return None, None, result.year - elif parse_result['period'] == 'month': + elif parse_result["period"] == "month": return None, result.month, result.year else: return result.date(), result.month, result.year @@ -1126,6 +1185,7 @@ def parse_single_date(value): return None, None, None + def parse_datacite_dates(dates): """ Given a list of date fields (under .dates), return tuple, (release_date, @@ -1137,37 +1197,37 @@ def parse_datacite_dates(dates): return release_date, release_month, release_year if not isinstance(dates, list): - raise ValueError('expected a list of date items') + raise ValueError("expected a list of date items") # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted", # "Collected", "Updated", "Copyrighted", "Created" # Ignored for now: "Collected", "Issued" date_type_prio = ( - 'Valid', - 'Available', - 'Accepted', - 'Submitted', - 'Copyrighted', - 'Created', - 'Updated', + "Valid", + "Available", + "Accepted", + "Submitted", + "Copyrighted", + "Created", + "Updated", ) # We need to note the granularity, since a string like "2019" would be # parsed into "2019-01-01", even though the month is unknown. Use 3 # granularity types: 'y', 'm', 'd'. - Pattern = collections.namedtuple('Pattern', 'layout granularity') + Pattern = collections.namedtuple("Pattern", "layout granularity") # Before using (expensive) dateparser, try a few common patterns. common_patterns = ( - Pattern('%Y-%m-%d', 'd'), - Pattern('%Y-%m', 'm'), - Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), - Pattern('%Y-%m-%dT%H:%M:%S', 'd'), - Pattern('%Y', 'y'), + Pattern("%Y-%m-%d", "d"), + Pattern("%Y-%m", "m"), + Pattern("%Y-%m-%dT%H:%M:%SZ", "d"), + Pattern("%Y-%m-%dT%H:%M:%S", "d"), + Pattern("%Y", "y"), ) def parse_item(item): - result, value, year_only = None, str(item.get('date', '')) or '', False + result, value, year_only = None, str(item.get("date", "")) or "", False release_date, release_month, release_year = None, None, None for layout, granularity in common_patterns: @@ -1176,22 +1236,22 @@ def parse_datacite_dates(dates): except ValueError: continue else: - if granularity == 'y': + if granularity == "y": year_only = True break if result is None: - print('fallback for {}'.format(value), file=sys.stderr) + print("fallback for {}".format(value), file=sys.stderr) release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. return release_date, release_month, release_year - if granularity != 'y': + if granularity != "y": release_date = result.date() release_year = result.year - if granularity in ('m', 'd'): + if granularity in ("m", "d"): release_month = result.month return release_date, release_month, release_year @@ -1200,7 +1260,7 @@ def parse_datacite_dates(dates): for prio in date_type_prio: for item in dates: - if not item.get('dateType') == prio: + if not item.get("dateType") == prio: continue release_date, release_month, release_year = parse_item(item) @@ -1224,45 +1284,49 @@ def parse_datacite_dates(dates): return release_date, release_month, release_year + def index_form_to_display_name(s): """ Try to convert an index form name, like 'Razis, Panos A' into display_name, e.g. 'Panos A Razis'. 
""" - if ',' not in s: + if "," not in s: return s - skip_on_chars = ['(', ')', '*'] + skip_on_chars = ["(", ")", "*"] for char in skip_on_chars: if char in s: return s - if s.count(',') > 1: + if s.count(",") > 1: # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" return s # Not names, but sprinkled in fields where authors live. - stopwords = [s.lower() for s in ( - 'Archive', - 'Collection', - 'Coordinator', - 'Department', - 'Germany', - 'International', - 'National', - 'Netherlands', - 'Office', - 'Organisation', - 'Organization', - 'Service', - 'Services', - 'United States', - 'University', - 'Verein', - 'Volkshochschule', - )] + stopwords = [ + s.lower() + for s in ( + "Archive", + "Collection", + "Coordinator", + "Department", + "Germany", + "International", + "National", + "Netherlands", + "Office", + "Organisation", + "Organization", + "Service", + "Services", + "United States", + "University", + "Verein", + "Volkshochschule", + ) + ] lower = s.lower() for stop in stopwords: if stop in lower: return s - a, b = s.split(',') - return '{} {}'.format(b.strip(), a.strip()) + a, b = s.split(",") + return "{} {}".format(b.strip(), a.strip()) diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py index 3d280fb7..603a6271 100644 --- a/python/fatcat_tools/importers/dblp_container.py +++ b/python/fatcat_tools/importers/dblp_container.py @@ -1,4 +1,3 @@ - """ Importer for DBLP container-level (journal/conference/series) metadata, pre-scraped in to JSON from HTML pages. @@ -13,17 +12,17 @@ from fatcat_tools.normal import clean_str class DblpContainerImporter(EntityImporter): + def __init__( + self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs + ): - def __init__(self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata scraped from dblp HTML") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DblpContainerImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of container-level metadata scraped from dblp HTML", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpContainerImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.dblp_container_map_output = dblp_container_map_output self.read_dblp_container_map_file(dblp_container_map_file) @@ -40,7 +39,10 @@ class DblpContainerImporter(EntityImporter): assert len(container_id) == 26 self._dblp_container_map[prefix] = container_id print("\t".join([prefix, container_id]), file=self.dblp_container_map_output) - print("Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr) + print( + "Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), + file=sys.stderr, + ) def lookup_dblp_prefix(self, prefix): if not prefix: @@ -57,48 +59,48 @@ class DblpContainerImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - dblp_prefix = row.get('key') or row.get('dblp_prefix') + dblp_prefix = row.get("key") or row.get("dblp_prefix") assert dblp_prefix - assert row['title'] + assert row["title"] container_type = None - if 
dblp_prefix.startswith('conf/'): + if dblp_prefix.startswith("conf/"): container_type = "conference-series" - elif dblp_prefix.startswith('journals/'): + elif dblp_prefix.startswith("journals/"): container_type = "journal" - elif dblp_prefix.startswith('series/'): + elif dblp_prefix.startswith("series/"): container_type = "book-series" issnl = None - for issn in row.get('issns', []): + for issn in row.get("issns", []): issnl = self.issn2issnl(issn) if issnl: break extra = { - 'dblp': { - 'prefix': dblp_prefix, + "dblp": { + "prefix": dblp_prefix, }, } - if row.get('homepage_url'): - extra['urls'] = [row['homepage_url']] + if row.get("homepage_url"): + extra["urls"] = [row["homepage_url"]] - if row.get('acronym'): - extra['acronym'] = row['acronym'] + if row.get("acronym"): + extra["acronym"] = row["acronym"] ce = fatcat_openapi_client.ContainerEntity( - name=clean_str(row['title']), + name=clean_str(row["title"]), container_type=container_type, issnl=issnl, - wikidata_qid=row.get('wikidata_qid'), + wikidata_qid=row.get("wikidata_qid"), extra=extra, ) return ce def try_update(self, ce): - dblp_prefix = ce.extra['dblp']['prefix'] + dblp_prefix = ce.extra["dblp"]["prefix"] existing = None existing_container_id = self.lookup_dblp_prefix(dblp_prefix) if existing_container_id: @@ -123,8 +125,11 @@ class DblpContainerImporter(EntityImporter): return True if existing: - self.counts['exists'] += 1 - print("\t".join([ce.extra['dblp']['prefix'], existing.ident]), file=self.dblp_container_map_output) + self.counts["exists"] += 1 + print( + "\t".join([ce.extra["dblp"]["prefix"], existing.ident]), + file=self.dblp_container_map_output, + ) return False # shouldn't get here @@ -135,11 +140,17 @@ class DblpContainerImporter(EntityImporter): Because we want to print a prefix/container_id match for each row, we require a special batch insert method """ - eg = self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + eg = self.api.create_container_auto_batch( + fatcat_openapi_client.ContainerAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) for c_edit in eg.edits.containers: c = self.api.get_container(c_edit.ident) - print("\t".join([c.extra['dblp']['prefix'], c.ident]), file=self.dblp_container_map_output) + print( + "\t".join([c.extra["dblp"]["prefix"], c.ident]), + file=self.dblp_container_map_output, + ) diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index 6d028f2f..5baa6cd6 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -1,4 +1,3 @@ - """ Importer for DBLP release-level (article/paper/etc) XML metadata. 
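(Aside, not part of the diff: the prefix-to-container_type mapping in DblpContainerImporter.parse_record() above condenses to the sketch below; the helper function is hypothetical and the sample keys are only examples.)

def container_type_for_dblp_prefix(dblp_prefix):
    # Mirrors the branch logic in parse_record() above.
    if dblp_prefix.startswith("conf/"):
        return "conference-series"
    if dblp_prefix.startswith("journals/"):
        return "journal"
    if dblp_prefix.startswith("series/"):
        return "book-series"
    return None

assert container_type_for_dblp_prefix("conf/sigmod") == "conference-series"
assert container_type_for_dblp_prefix("journals/cacm") == "journal"
assert container_type_for_dblp_prefix("series/lncs") == "book-series"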
@@ -44,25 +43,16 @@ from fatcat_tools.transforms import entity_to_dict class DblpReleaseImporter(EntityImporter): - - def __init__(self, - api, - dblp_container_map_file=None, - **kwargs): + def __init__(self, api, dblp_container_map_file=None, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of dblp metadata via XML records" + "editgroup_description", "Automated import of dblp metadata via XML records" ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DblpReleaseImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpReleaseImporter") # ensure default is to not do updates with this worker (override super() default) - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.dump_json_mode = kwargs.get("dump_json_mode", False) self.this_year = datetime.datetime.now().year @@ -76,13 +66,16 @@ class DblpReleaseImporter(EntityImporter): "phdthesis", "mastersthesis", "www", - #"data", # no instances in 2020-11 dump + # "data", # no instances in 2020-11 dump ] def read_dblp_container_map_file(self, dblp_container_map_file) -> None: self._dblp_container_map = dict() if not dblp_container_map_file: - print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr) + print( + "Not loading a dblp prefix container map file; entities will fail to import", + file=sys.stderr, + ) return print("Loading dblp prefix container map file...", file=sys.stderr) for line in dblp_container_map_file: @@ -92,7 +85,10 @@ class DblpReleaseImporter(EntityImporter): container_id = container_id.strip() assert len(container_id) == 26 self._dblp_container_map[prefix] = container_id - print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr) + print( + "Got {} dblp container mappings.".format(len(self._dblp_container_map)), + file=sys.stderr, + ) def lookup_dblp_prefix(self, prefix): if not prefix: @@ -101,13 +97,13 @@ class DblpReleaseImporter(EntityImporter): def want(self, xml_elem): if xml_elem.name not in self.ELEMENT_TYPES: - self.counts['skip-type'] += 1 + self.counts["skip-type"] += 1 return False - if not xml_elem.get('key'): - self.counts['skip-no-key'] += 1 + if not xml_elem.get("key"): + self.counts["skip-no-key"] += 1 return False - if xml_elem['key'].startswith('homepage/'): - self.counts['skip-type-homepage'] += 1 + if xml_elem["key"].startswith("homepage/"): + self.counts["skip-type-homepage"] += 1 return False return True @@ -127,88 +123,88 @@ class DblpReleaseImporter(EntityImporter): - isbn """ - dblp_key = xml_elem.get('key') + dblp_key = xml_elem.get("key") if not dblp_key: - self.counts['skip-empty-key'] += 1 + self.counts["skip-empty-key"] += 1 return False - dblp_key_type = dblp_key.split('/')[0] + dblp_key_type = dblp_key.split("/")[0] # dblp_prefix may be used for container lookup dblp_prefix = None - if dblp_key_type in ('journals', 'conf'): - dblp_prefix = '/'.join(dblp_key.split('/')[:2]) - elif dblp_key_type in ('series', 'reference', 'tr', 'books'): - dblp_prefix = '/'.join(dblp_key.split('/')[:-1]) + if dblp_key_type in ("journals", "conf"): + dblp_prefix = "/".join(dblp_key.split("/")[:2]) + elif dblp_key_type in 
("series", "reference", "tr", "books"): + dblp_prefix = "/".join(dblp_key.split("/")[:-1]) - publtype = xml_elem.get('publtype') or None + publtype = xml_elem.get("publtype") or None dblp_type = xml_elem.name if dblp_type not in self.ELEMENT_TYPES: - self.counts[f'skip-dblp-type:{dblp_type}'] += 1 + self.counts[f"skip-dblp-type:{dblp_type}"] += 1 - if dblp_key_type in ('homepages', 'persons', 'dblpnote'): - self.counts['skip-key-type'] += 1 + if dblp_key_type in ("homepages", "persons", "dblpnote"): + self.counts["skip-key-type"] += 1 return False - if dblp_key.startswith('journals/corr/'): - self.counts['skip-arxiv-corr'] += 1 + if dblp_key.startswith("journals/corr/"): + self.counts["skip-arxiv-corr"] += 1 return False title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True) if not title: - self.counts['skip-title'] += 1 + self.counts["skip-title"] += 1 return False - if title.endswith('.'): + if title.endswith("."): title = title[:-1] release_type = None - release_stage = 'published' + release_stage = "published" withdrawn_status = None # primary releae_type detection: type of XML element, then prefix of key for granularity - if dblp_type == 'article': - release_type = 'article' - if dblp_key_type == 'journals' and publtype != 'informal': - release_type = 'article-journal' - elif dblp_key_type == 'tr': - release_type = 'report' + if dblp_type == "article": + release_type = "article" + if dblp_key_type == "journals" and publtype != "informal": + release_type = "article-journal" + elif dblp_key_type == "tr": + release_type = "report" elif title.startswith("Review:"): - release_type = 'review' - elif dblp_type == 'inproceedings': - release_type = 'paper-conference' - elif dblp_type == 'book': - release_type = 'book' - elif dblp_type == 'incollection': + release_type = "review" + elif dblp_type == "inproceedings": + release_type = "paper-conference" + elif dblp_type == "book": + release_type = "book" + elif dblp_type == "incollection": # XXX: part vs. chapter? - release_type = 'chapter' - elif dblp_type == 'data': - release_type = 'dataset' - elif dblp_type in ('mastersthesis', 'phdthesis'): - release_type = 'thesis' + release_type = "chapter" + elif dblp_type == "data": + release_type = "dataset" + elif dblp_type in ("mastersthesis", "phdthesis"): + release_type = "thesis" # overrides/extensions of the above - if publtype == 'informal': + if publtype == "informal": # for conferences, seems to indicate peer-review status # for journals, seems to indicate things like book reviews; split out above pass - elif publtype == 'encyclopedia': - release_type = 'entry-encyclopedia' - elif publtype == 'edited': + elif publtype == "encyclopedia": + release_type = "entry-encyclopedia" + elif publtype == "edited": # XXX: article? - release_type = 'editorial' - elif publtype == 'data': - release_type = 'dataset' - elif publtype == 'data': - release_type = 'dataset' - elif publtype == 'software': - release_type = 'software' - elif publtype == 'widthdrawn': - withdrawn_status = 'widthdrawn' - elif publtype == 'survey': + release_type = "editorial" + elif publtype == "data": + release_type = "dataset" + elif publtype == "data": + release_type = "dataset" + elif publtype == "software": + release_type = "software" + elif publtype == "widthdrawn": + withdrawn_status = "widthdrawn" + elif publtype == "survey": # XXX: flag as a review/survey article? 
pass - #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr) + # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr) container_name = None booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text) @@ -236,7 +232,9 @@ class DblpReleaseImporter(EntityImporter): part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_month = None release_year = None @@ -245,39 +243,39 @@ class DblpReleaseImporter(EntityImporter): if isbn: ext_ids.isbn13 = isbn if ext_ids.doi: - self.counts['has-doi'] += 1 + self.counts["has-doi"] += 1 # dblp-specific extra dblp_extra = dict(type=dblp_type) note = clean_str(xml_elem.note and xml_elem.note.text) - if note and 'base-search.net' not in note: - dblp_extra['note'] = note + if note and "base-search.net" not in note: + dblp_extra["note"] = note if part_of_key: - dblp_extra['part_of_key'] = part_of_key + dblp_extra["part_of_key"] = part_of_key # generic extra extra = dict() if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name - if series and (dblp_key_type == 'series' or dblp_type == 'book'): - extra['series-title'] = series + if series and (dblp_key_type == "series" or dblp_type == "book"): + extra["series-title"] = series elif series: - dblp_extra['series'] = series + dblp_extra["series"] = series - if booktitle and dblp_key_type == 'series': - extra['container-title'] = booktitle - elif booktitle and dblp_key_type == 'conf': - extra['event'] = booktitle + if booktitle and dblp_key_type == "series": + extra["container-title"] = booktitle + elif booktitle and dblp_key_type == "conf": + extra["event"] = booktitle elif booktitle: - dblp_extra['booktitle'] = booktitle + dblp_extra["booktitle"] = booktitle if release_year and release_month: # TODO: release_month schema migration - extra['release_month'] = release_month + extra["release_month"] = release_month if dblp_extra: - extra['dblp'] = dblp_extra + extra["dblp"] = dblp_extra if not extra: extra = None @@ -289,7 +287,7 @@ class DblpReleaseImporter(EntityImporter): withdrawn_status=withdrawn_status, title=title, release_year=release_year, - #release_date, + # release_date, publisher=publisher, ext_ids=ext_ids, contribs=contribs, @@ -302,8 +300,8 @@ class DblpReleaseImporter(EntityImporter): if self.dump_json_mode: re_dict = entity_to_dict(re, api_client=self.api.api_client) - re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem) - re_dict['_dblp_prefix'] = dblp_prefix + re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem) + re_dict["_dblp_prefix"] = dblp_prefix print(json.dumps(re_dict, sort_keys=True)) return False @@ -341,11 +339,11 @@ class DblpReleaseImporter(EntityImporter): # then try other ext_id lookups if not existing: - for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'): + for extid_type in ("doi", "wikidata_qid", "isbn13", "arxiv"): extid_val = getattr(re.ext_ids, extid_type) if not extid_val: continue - #print(f" lookup release type: {extid_type} val: {extid_val}") + # print(f" lookup release type: {extid_type} val: {extid_val}") try: existing = self.api.lookup_release(**{extid_type: extid_val}) except fatcat_openapi_client.rest.ApiException as err: @@ -373,12 +371,14 @@ class 
DblpReleaseImporter(EntityImporter): return True if not self.do_updates or existing.ext_ids.dblp: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # logic for whether to do update or skip - if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv: - self.counts['skip-update'] += 1 + if ( + existing.container_id and existing.release_type and existing.release_stage + ) or existing.ext_ids.arxiv: + self.counts["skip-update"] += 1 return False # fields to copy over for update @@ -390,20 +390,20 @@ class DblpReleaseImporter(EntityImporter): existing.release_stage = existing.release_stage or re.release_stage existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status existing.container_id = existing.container_id or re.container_id - existing.extra['dblp'] = re.extra['dblp'] + existing.extra["dblp"] = re.extra["dblp"] existing.volume = existing.volume or re.volume existing.issue = existing.issue or re.issue existing.pages = existing.pages or re.pages try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -411,11 +411,14 @@ class DblpReleaseImporter(EntityImporter): return False def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: """ @@ -428,14 +431,14 @@ class DblpReleaseImporter(EntityImporter): """ contribs = [] index = 0 - for elem in authors.find_all('author'): + for elem in authors.find_all("author"): contrib = self.dblp_contrib_single(elem) contrib.role = "author" contrib.index = index contribs.append(contrib) index += 1 - for elem in authors.find_all('editor'): + for elem in authors.find_all("editor"): contrib = self.dblp_contrib_single(elem) contrib.role = "editor" contribs.append(contrib) @@ -459,10 +462,10 @@ class DblpReleaseImporter(EntityImporter): # remove number in author name, if present if raw_name.split()[-1].isdigit(): - raw_name = ' '.join(raw_name.split()[:-1]) + raw_name = " ".join(raw_name.split()[:-1]) - if elem.get('orcid'): - orcid = clean_orcid(elem['orcid']) + if elem.get("orcid"): + orcid = clean_orcid(elem["orcid"]) if orcid: creator_id = self.lookup_orcid(orcid) if not creator_id: @@ -491,22 +494,26 @@ class DblpReleaseImporter(EntityImporter): wikidata_qid: Optional[str] = None arxiv_id: Optional[str] = None hdl: Optional[str] = None - for ee in xml_elem.find_all('ee'): + for ee in xml_elem.find_all("ee"): url = ee.text # convert DOI-like domains, which mostly have DOIs anyways - if '://doi.acm.org/' in url: - url = url.replace('://doi.acm.org/', '://doi.org/') - elif '://doi.ieeecomputersociety.org/' in url: - 
url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/') + if "://doi.acm.org/" in url: + url = url.replace("://doi.acm.org/", "://doi.org/") + elif "://doi.ieeecomputersociety.org/" in url: + url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/") - if 'doi.org/10.' in url and not doi: + if "doi.org/10." in url and not doi: doi = clean_doi(url) - elif 'wikidata.org/entity/Q' in url and not wikidata_qid: + elif "wikidata.org/entity/Q" in url and not wikidata_qid: wikidata_qid = clean_wikidata_qid(url) - elif '://arxiv.org/abs/' in url and not arxiv_id: - arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '') + elif "://arxiv.org/abs/" in url and not arxiv_id: + arxiv_id = ( + url.replace("http://", "") + .replace("https://", "") + .replace("arxiv.org/abs/", "") + ) arxiv_id = clean_arxiv_id(arxiv_id) - elif '://hdl.handle.net' in url and not hdl: + elif "://hdl.handle.net" in url and not hdl: hdl = clean_hdl(url) return fatcat_openapi_client.ReleaseExtIds( @@ -525,14 +532,14 @@ class DblpReleaseImporter(EntityImporter): sandcrawler ingest requests. """ EXTID_PATTERNS = [ - '://doi.acm.org/', - '://doi.ieeecomputersociety.org/', - 'doi.org/10.', - 'wikidata.org/entity/Q', - '://arxiv.org/abs/', + "://doi.acm.org/", + "://doi.ieeecomputersociety.org/", + "doi.org/10.", + "wikidata.org/entity/Q", + "://arxiv.org/abs/", ] urls = [] - for ee in xml_elem.find_all('ee'): + for ee in xml_elem.find_all("ee"): url = ee.text skip = False for pattern in EXTID_PATTERNS: diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 1831c4cd..cd063337 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -28,26 +28,23 @@ MAX_ABSTRACT_LENGTH = 2048 class DoajArticleImporter(EntityImporter): - - def __init__(self, - api, - issn_map_file, - **kwargs): + def __init__(self, api, issn_map_file, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps" + "editgroup_description", + "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps", ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DoajArticleImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DoajArticleImporter") # ensure default is to not do updates with this worker (override super() default) - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - issn_map_file=issn_map_file, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__( + api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs, + ) self.this_year = datetime.datetime.now().year self.read_issn_map_file(issn_map_file) @@ -82,21 +79,21 @@ class DoajArticleImporter(EntityImporter): } """ - if not obj or not isinstance(obj, dict) or 'bibjson' not in obj: - self.counts['skip-empty'] += 1 + if not obj or not isinstance(obj, dict) or "bibjson" not in obj: + self.counts["skip-empty"] += 1 return None - bibjson = obj['bibjson'] + bibjson = obj["bibjson"] - title = clean_str(bibjson.get('title'), force_xml=True) + title = clean_str(bibjson.get("title"), force_xml=True) if not title: - self.counts['skip-title'] += 1 + 
self.counts["skip-title"] += 1 return False - container_name = clean_str(bibjson['journal']['title']) + container_name = clean_str(bibjson["journal"]["title"]) container_id = None # NOTE: 'issns' not documented in API schema - for issn in bibjson['journal']['issns']: + for issn in bibjson["journal"]["issns"]: issnl = self.issn2issnl(issn) if issnl: container_id = self.lookup_issnl(self.issn2issnl(issn)) @@ -105,75 +102,83 @@ class DoajArticleImporter(EntityImporter): container_name = None break - volume = clean_str(bibjson['journal'].get('volume')) + volume = clean_str(bibjson["journal"].get("volume")) # NOTE: this schema seems to use "number" as "issue number" - issue = clean_str(bibjson['journal'].get('number')) - publisher = clean_str(bibjson['journal'].get('publisher')) + issue = clean_str(bibjson["journal"].get("number")) + publisher = clean_str(bibjson["journal"].get("publisher")) try: - release_year = int(bibjson.get('year')) + release_year = int(bibjson.get("year")) except (TypeError, ValueError): release_year = None - release_month = parse_month(clean_str(bibjson.get('month'))) + release_month = parse_month(clean_str(bibjson.get("month"))) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_month = None release_year = None - license_slug = self.doaj_license_slug(bibjson['journal'].get('license')) - country = parse_country_name(bibjson['journal'].get('country')) + license_slug = self.doaj_license_slug(bibjson["journal"].get("license")) + country = parse_country_name(bibjson["journal"].get("country")) language = None - for raw in bibjson['journal'].get('language') or []: + for raw in bibjson["journal"].get("language") or []: language = parse_lang_name(raw) if language: break # pages # NOTE: error in API docs? 
seems like start_page not under 'journal' object - start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page')) - end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page')) + start_page = clean_str(bibjson["journal"].get("start_page")) or clean_str( + bibjson.get("start_page") + ) + end_page = clean_str(bibjson["journal"].get("end_page")) or clean_str( + bibjson.get("end_page") + ) pages: Optional[str] = None if start_page and end_page: pages = f"{start_page}-{end_page}" elif start_page: pages = start_page - doaj_article_id = obj['id'].lower() - ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id) + doaj_article_id = obj["id"].lower() + ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id) abstracts = self.doaj_abstracts(bibjson) - contribs = self.doaj_contribs(bibjson.get('author') or []) + contribs = self.doaj_contribs(bibjson.get("author") or []) # DOAJ-specific extra doaj_extra = dict() - if bibjson.get('subject'): - doaj_extra['subject'] = bibjson.get('subject') - if bibjson.get('keywords'): - doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k] + if bibjson.get("subject"): + doaj_extra["subject"] = bibjson.get("subject") + if bibjson.get("keywords"): + doaj_extra["keywords"] = [ + k for k in [clean_str(s) for s in bibjson.get("keywords")] if k + ] # generic extra extra = dict() if country: - extra['country'] = country + extra["country"] = country if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name if release_year and release_month: # TODO: schema migration - extra['release_month'] = release_month + extra["release_month"] = release_month if doaj_extra: - extra['doaj'] = doaj_extra + extra["doaj"] = doaj_extra if not extra: extra = None re = fatcat_openapi_client.ReleaseEntity( work_id=None, container_id=container_id, - release_type='article-journal', - release_stage='published', + release_type="article-journal", + release_stage="published", title=title, release_year=release_year, - #release_date, + # release_date, publisher=publisher, ext_ids=ext_ids, contribs=contribs, @@ -208,11 +213,11 @@ class DoajArticleImporter(EntityImporter): # then try other ext_id lookups if not existing: - for extid_type in ('doi', 'pmid', 'pmcid'): + for extid_type in ("doi", "pmid", "pmcid"): extid_val = getattr(re.ext_ids, extid_type) if not extid_val: continue - #print(f" lookup release type: {extid_type} val: {extid_val}") + # print(f" lookup release type: {extid_type} val: {extid_val}") try: existing = self.api.lookup_release(**{extid_type: extid_val}) except fatcat_openapi_client.rest.ApiException as err: @@ -241,7 +246,7 @@ class DoajArticleImporter(EntityImporter): # other logic could go here about skipping updates if not self.do_updates or existing.ext_ids.doaj: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # fields to copy over for update @@ -250,7 +255,7 @@ class DoajArticleImporter(EntityImporter): existing.release_stage = existing.release_stage or re.release_stage existing.container_id = existing.container_id or re.container_id existing.abstracts = existing.abstracts or re.abstracts - existing.extra['doaj'] = re.extra['doaj'] + existing.extra["doaj"] = re.extra["doaj"] existing.volume = existing.volume or re.volume existing.issue = existing.issue or re.issue existing.pages = existing.pages or re.pages @@ -258,13 +263,13 @@ class 
DoajArticleImporter(EntityImporter): try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -272,14 +277,17 @@ class DoajArticleImporter(EntityImporter): return False def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]: - text = clean_str(bibjson.get('abstract')) + text = clean_str(bibjson.get("abstract")) if not text or len(text) < 10: return [] if len(text) > MAX_ABSTRACT_LENGTH: @@ -293,7 +301,9 @@ class DoajArticleImporter(EntityImporter): lang=lang, ) - return [abstract,] + return [ + abstract, + ] def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: """ @@ -306,23 +316,27 @@ class DoajArticleImporter(EntityImporter): contribs = [] index = 0 for author in authors: - if not author.get('name'): + if not author.get("name"): continue creator_id = None - orcid = clean_orcid(author.get('orcid_id')) + orcid = clean_orcid(author.get("orcid_id")) if orcid: creator_id = self.lookup_orcid(orcid) - contribs.append(fatcat_openapi_client.ReleaseContrib( - raw_name=author.get('name'), - role='author', - index=index, - creator_id=creator_id, - raw_affiliation=clean_str(author.get('affiliation')), - )) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + raw_name=author.get("name"), + role="author", + index=index, + creator_id=creator_id, + raw_affiliation=clean_str(author.get("affiliation")), + ) + ) index += 1 return contribs - def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds: + def doaj_ext_ids( + self, identifiers: List[dict], doaj_article_id: str + ) -> fatcat_openapi_client.ReleaseExtIds: """ bibjson.identifier { id (string), @@ -336,14 +350,14 @@ class DoajArticleImporter(EntityImporter): pmid: Optional[str] = None pmcid: Optional[str] = None for id_obj in identifiers: - if not id_obj.get('id'): + if not id_obj.get("id"): continue - if id_obj['type'].lower() == 'doi': - doi = clean_doi(id_obj['id']) - elif id_obj['type'].lower() == 'pmid': - pmid = clean_pmid(id_obj['id']) - elif id_obj['type'].lower() == 'pmcid': - pmcid = clean_pmcid(id_obj['id']) + if id_obj["type"].lower() == "doi": + doi = clean_doi(id_obj["id"]) + elif id_obj["type"].lower() == "pmid": + pmid = clean_pmid(id_obj["id"]) + elif id_obj["type"].lower() == "pmcid": + pmcid = clean_pmcid(id_obj["id"]) return fatcat_openapi_client.ReleaseExtIds( doaj=doaj_article_id, @@ -365,10 +379,10 @@ class DoajArticleImporter(EntityImporter): if not license_list: return None for license in license_list: - if not license.get('open_access'): + if not 
license.get("open_access"): continue - slug = license.get('type') - if slug.startswith('CC '): - slug = slug.replace('CC ', 'cc-').lower() + slug = license.get("type") + if slug.startswith("CC "): + slug = slug.replace("CC ", "cc-").lower() return slug return None diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 0951ed84..26584ff3 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from .common import EntityImporter @@ -17,19 +16,16 @@ class FileMetaImporter(EntityImporter): def __init__(self, api, require_grobid=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileMetaImporter') - kwargs['do_updates'] = kwargs.get("do_updates", True) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates" + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileMetaImporter") + kwargs["do_updates"] = kwargs.get("do_updates", True) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - for k in ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype'): + for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"): if not row.get(k): - self.counts['skip-missing-field'] += 1 + self.counts["skip-missing-field"] += 1 return False return True @@ -40,11 +36,11 @@ class FileMetaImporter(EntityImporter): file_meta = row fe = fatcat_openapi_client.FileEntity( - md5=file_meta['md5hex'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - mimetype=file_meta['mimetype'], + md5=file_meta["md5hex"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + mimetype=file_meta["mimetype"], ) return fe @@ -59,11 +55,11 @@ class FileMetaImporter(EntityImporter): raise err if not existing: - self.counts['skip-no-match'] += 1 + self.counts["skip-no-match"] += 1 return False - if (existing.md5 and existing.sha256 and existing.size and existing.mimetype): - self.counts['skip-existing-complete'] += 1 + if existing.md5 and existing.sha256 and existing.size and existing.mimetype: + self.counts["skip-existing-complete"] += 1 return False existing.md5 = existing.md5 or fe.md5 @@ -75,5 +71,5 @@ class FileMetaImporter(EntityImporter): existing = self.generic_file_cleanups(existing) self.api.update_file(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py index 43c2a49c..dd8f5600 100644 --- a/python/fatcat_tools/importers/fileset_generic.py +++ b/python/fatcat_tools/importers/fileset_generic.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from fatcat_tools import entity_from_dict @@ -20,34 +19,31 @@ class FilesetImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Generic Fileset entity import" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FilesetImporter') - kwargs['do_updates'] = 
bool(kwargs.get("do_updates", False)) + eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import" + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FilesetImporter") + kwargs["do_updates"] = bool(kwargs.get("do_updates", False)) self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False)) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) # bezerk mode doesn't make sense for this importer assert self.bezerk_mode is False def want(self, row): - if not row.get('release_ids'): - self.counts['skip-no-release-ids'] += 1 + if not row.get("release_ids"): + self.counts["skip-no-release-ids"] += 1 return False - if not row.get('urls'): - self.counts['skip-no-urls'] += 1 + if not row.get("urls"): + self.counts["skip-no-urls"] += 1 return False - if not row.get('manifest'): - self.counts['skip-no-files'] += 1 + if not row.get("manifest"): + self.counts["skip-no-files"] += 1 return False - for f in row.get('manifest'): - for k in ('sha1', 'md5'): + for f in row.get("manifest"): + for k in ("sha1", "md5"): if not f.get(k): - self.counts['skip-missing-file-field'] += 1 + self.counts["skip-missing-file-field"] += 1 return False return True @@ -66,19 +62,24 @@ class FilesetImporter(EntityImporter): if not self.skip_release_fileset_check: for release_id in fse.release_ids: # don't catch 404, that would be an error - release = self.api.get_release(release_id, expand='filesets', hide='abstracts,refs') - assert release.state == 'active' + release = self.api.get_release( + release_id, expand="filesets", hide="abstracts,refs" + ) + assert release.state == "active" if release.filesets: - self.counts['exists'] += 1 - self.counts['exists-via-release-filesets'] += 1 + self.counts["exists"] += 1 + self.counts["exists-via-release-filesets"] += 1 return False # do the insert return True def insert_batch(self, batch): - self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_fileset_auto_batch( + fatcat_openapi_client.FilesetAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 0f666652..f7bb5357 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,7 +7,7 @@ import fatcat_openapi_client from .common import EntityImporter, clean, make_rel_url -MAX_ABSTRACT_BYTES=4096 +MAX_ABSTRACT_BYTES = 4096 class GrobidMetadataImporter(EntityImporter): @@ -24,14 +24,13 @@ class GrobidMetadataImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Import of release and file metadata, as extracted from PDFs by GROBID.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Import of release and file metadata, as extracted from PDFs by GROBID.", 
+ ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.GrobidMetadataImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") self.longtail_oa = kwargs.get("longtail_oa", False) @@ -40,7 +39,7 @@ class GrobidMetadataImporter(EntityImporter): def parse_record(self, row): - fields = row.split('\t') + fields = row.split("\t") sha1_key = fields[0] cdx = json.loads(fields[1]) mimetype = fields[2] @@ -65,8 +64,8 @@ class GrobidMetadataImporter(EntityImporter): # TODO: this is where we should check if the file actually has # release_ids and/or URLs associated with it if existing and not self.bezerk_mode: - self.counts['exists'] += 1 - self.counts['skip'] -= 1 + self.counts["exists"] += 1 + self.counts["skip"] -= 1 return None release_edit = self.create_release(re) @@ -75,75 +74,81 @@ class GrobidMetadataImporter(EntityImporter): def parse_grobid_json(self, obj): - if not obj.get('title'): + if not obj.get("title"): return None extra_grobid = dict() - abstract = obj.get('abstract') + abstract = obj.get("abstract") if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: abobj = fatcat_openapi_client.ReleaseAbstract( - mimetype="text/plain", - content=clean(obj.get('abstract'))) + mimetype="text/plain", content=clean(obj.get("abstract")) + ) abstracts = [abobj] else: abstracts = None contribs = [] - for i, a in enumerate(obj.get('authors', [])): - contribs.append(fatcat_openapi_client.ReleaseContrib( - index=i, - raw_name=clean(a['name']), - given_name=clean(a.get('given_name')), - surname=clean(a.get('surname')), - role="author", - extra=None)) + for i, a in enumerate(obj.get("authors", [])): + contribs.append( + fatcat_openapi_client.ReleaseContrib( + index=i, + raw_name=clean(a["name"]), + given_name=clean(a.get("given_name")), + surname=clean(a.get("surname")), + role="author", + extra=None, + ) + ) refs = [] - for raw in obj.get('citations', []): + for raw in obj.get("citations", []): cite_extra = dict() year = None - if raw.get('date'): + if raw.get("date"): try: - year = int(raw['date'].strip()[:4]) + year = int(raw["date"].strip()[:4]) except (IndexError, ValueError): pass - for key in ('volume', 'url', 'issue', 'publisher'): + for key in ("volume", "url", "issue", "publisher"): if raw.get(key): cite_extra[key] = clean(raw[key]) - if raw.get('authors'): - cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] + if raw.get("authors"): + cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]] if not cite_extra: cite_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - key=clean(raw.get('id')), - year=year, - title=clean(raw['title']), - extra=cite_extra)) + refs.append( + fatcat_openapi_client.ReleaseRef( + key=clean(raw.get("id")), + year=year, + title=clean(raw["title"]), + extra=cite_extra, + ) + ) release_date = None release_year = None - if obj.get('date'): + if obj.get("date"): # only returns year, ever? - release_year = int(obj['date'][:4]) + release_year = int(obj["date"][:4]) extra = dict() - if obj.get('doi'): - extra['doi'] = obj['doi'] - if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = clean(obj['journal']['name']) + if obj.get("doi"): + extra["doi"] = obj["doi"] + if obj["journal"] and obj["journal"].get("name"): + extra["container_name"] = clean(obj["journal"]["name"]) # TODO: ISSN/eISSN handling? or just journal name lookup? 
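(Aside, not part of the diff: the file-identity handling in parse_file_metadata() further below, converting the "sha1:<base32>" key to lowercase hex and building a wayback URL from CDX fields, looks like this in isolation; the digest and CDX values are made-up examples.)

import base64

sha1_key = "sha1:TGCCY7TGRDSQHGW6WORGXFY3PLR47BBP"  # hypothetical example value
sha1_hex = (
    base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
    .decode("ascii")
    .lower()
)
assert len(sha1_hex) == 40  # 20-byte SHA-1 digest as lowercase hex

cdx = {"url": "https://example.com/paper.pdf", "dt": "20200101000000"}  # hypothetical CDX fields
wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], cdx["url"])
assert wayback == "https://web.archive.org/web/20200101000000/https://example.com/paper.pdf"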
if extra_grobid: - extra['grobid'] = extra_grobid + extra["grobid"] = extra_grobid if self.longtail_oa: - extra['longtail_oa'] = True + extra["longtail_oa"] = True if not extra: extra = None - title = clean(obj['title'], force_xml=True) + title = clean(obj["title"], force_xml=True) if not title or len(title) < 2: return None @@ -154,17 +159,22 @@ class GrobidMetadataImporter(EntityImporter): release_year=release_year, contribs=contribs, refs=refs, - publisher=clean(obj['journal'].get('publisher')), - volume=clean(obj['journal'].get('volume')), - issue=clean(obj['journal'].get('issue')), + publisher=clean(obj["journal"].get("publisher")), + volume=clean(obj["journal"].get("volume")), + issue=clean(obj["journal"].get("issue")), abstracts=abstracts, ext_ids=fatcat_openapi_client.ReleaseExtIds(), - extra=extra) + extra=extra, + ) return re def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): - sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() + sha1 = ( + base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", ""))) + .decode("ascii") + .lower() + ) fe = fatcat_openapi_client.FileEntity( sha1=sha1, @@ -175,16 +185,15 @@ class GrobidMetadataImporter(EntityImporter): ) # parse URLs and CDX - original = cdx['url'] - assert len(cdx['dt']) >= 8 - wayback = "https://web.archive.org/web/{}/{}".format( - cdx['dt'], - original) - fe.urls.append( - fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive")) + original = cdx["url"] + assert len(cdx["dt"]) >= 8 + wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original) + fe.urls.append(fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive")) original_url = make_rel_url(original, default_link_rel=self.default_link_rel) if original_url is not None: - fe.urls.append(fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1])) + fe.urls.append( + fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1]) + ) return fe @@ -193,8 +202,11 @@ class GrobidMetadataImporter(EntityImporter): return True def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index f0943c1e..e0a6c3f5 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,4 +1,3 @@ - import datetime import fatcat_openapi_client @@ -7,17 +6,16 @@ from .common import EntityImporter, make_rel_url class IngestFileResultImporter(EntityImporter): - def __init__(self, api, require_grobid=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Files crawled from web using sandcrawler ingest tool" + ) + eg_extra = 
kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFileResultImporter") + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.use_glutton_match = False self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel @@ -27,20 +25,20 @@ class IngestFileResultImporter(EntityImporter): else: print("NOT checking GROBID success") self.ingest_request_source_allowlist = [ - 'fatcat-changelog', - 'fatcat-ingest-container', - 'fatcat-ingest', - 'arabesque', + "fatcat-changelog", + "fatcat-ingest-container", + "fatcat-ingest", + "arabesque", #'mag-corpus', #'mag', - 'unpaywall-corpus', - 'unpaywall', + "unpaywall-corpus", + "unpaywall", #'s2-corpus', #'s2', - 'doaj', - 'dblp', + "doaj", + "dblp", ] - if kwargs.get('skip_source_allowlist', False): + if kwargs.get("skip_source_allowlist", False): self.ingest_request_source_allowlist = [] def want_file(self, row) -> bool: @@ -48,28 +46,32 @@ class IngestFileResultImporter(EntityImporter): File-specific part of want(). Generic across general ingest and save-paper-now. """ - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False # type-specific filters - if row['request'].get('ingest_type') == 'pdf': - if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: - self.counts['skip-grobid'] += 1 + if row["request"].get("ingest_type") == "pdf": + if self.require_grobid and row.get("grobid", {}).get("status_code") != 200: + self.counts["skip-grobid"] += 1 return False - if row['file_meta'].get('mimetype') not in ("application/pdf",): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("application/pdf",): + self.counts["skip-mimetype"] += 1 return False - elif row['request'].get('ingest_type') == 'xml': - if row['file_meta'].get('mimetype') not in ("application/xml", - "application/jats+xml", "application/tei+xml", "text/xml"): - self.counts['skip-mimetype'] += 1 + elif row["request"].get("ingest_type") == "xml": + if row["file_meta"].get("mimetype") not in ( + "application/xml", + "application/jats+xml", + "application/tei+xml", + "text/xml", + ): + self.counts["skip-mimetype"] += 1 return False - elif row['request'].get('ingest_type') in ['component', 'src', 'dataset-file']: + elif row["request"].get("ingest_type") in ["component", "src", "dataset-file"]: # we rely on sandcrawler for these checks pass else: - self.counts['skip-ingest-type'] += 1 + self.counts["skip-ingest-type"] += 1 return False return True @@ -79,24 +81,36 @@ class IngestFileResultImporter(EntityImporter): Sandcrawler ingest-specific part of want(). Generic across file and webcapture ingest. 
""" - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist: - self.counts['skip-ingest_request_source'] += 1 + if ( + self.ingest_request_source_allowlist + and source not in self.ingest_request_source_allowlist + ): + self.counts["skip-ingest_request_source"] += 1 return False - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'): - self.counts['skip-link-source'] += 1 + if row["request"].get("link_source") not in ( + "arxiv", + "pmc", + "unpaywall", + "doi", + "mag", + "s2", + "doaj", + "dblp", + ): + self.counts["skip-link-source"] += 1 return False - if source.startswith('savepapernow'): + if source.startswith("savepapernow"): # never process async savepapernow requests - self.counts['skip-savepapernow'] += 1 + self.counts["skip-savepapernow"] += 1 return False return True @@ -125,19 +139,19 @@ class IngestFileResultImporter(EntityImporter): def parse_ingest_release_ident(self, row): - request = row['request'] - fatcat = request.get('fatcat') + request = row["request"] + fatcat = request.get("fatcat") release_ident = None - if fatcat and fatcat.get('release_ident'): - release_ident = fatcat.get('release_ident') - elif request.get('ext_ids'): + if fatcat and fatcat.get("release_ident"): + release_ident = fatcat.get("release_ident") + elif request.get("ext_ids"): # if no fatcat ident, try extids - for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv', 'doaj', 'dblp'): - extid = request['ext_ids'].get(extid_type) + for extid_type in ("doi", "pmid", "pmcid", "arxiv", "doaj", "dblp"): + extid = request["ext_ids"].get(extid_type) if not extid: continue - if extid_type == 'doi': + if extid_type == "doi": extid = extid.lower() try: release = self.api.lookup_release(**{extid_type: extid}) @@ -145,66 +159,69 @@ class IngestFileResultImporter(EntityImporter): if err.status == 404: continue elif err.status == 400: - self.counts['warn-extid-invalid'] += 1 + self.counts["warn-extid-invalid"] += 1 continue raise err # verify release_stage - if request.get('release_stage') and release.release_stage: - if request['release_stage'] != release.release_stage: - self.counts['skip-release-stage'] += 1 + if request.get("release_stage") and release.release_stage: + if request["release_stage"] != release.release_stage: + self.counts["skip-release-stage"] += 1 return None release_ident = release.ident break - if self.use_glutton_match and not release_ident and row.get('grobid'): + if self.use_glutton_match and not release_ident and row.get("grobid"): # try biblio-glutton extracted hit - if row['grobid'].get('fatcat_release'): - release_ident = row['grobid']['fatcat_release'].split('_')[-1] - self.counts['glutton-match'] += 1 + if row["grobid"].get("fatcat_release"): + release_ident = row["grobid"]["fatcat_release"].split("_")[-1] + self.counts["glutton-match"] += 1 return release_ident def parse_terminal(self, row): - terminal = row.get('terminal') + terminal = row.get("terminal") if not terminal: # support old cdx-only ingest results - cdx = row.get('cdx') + cdx = row.get("cdx") if not cdx: return None else: terminal = { - 'terminal_url': 
cdx['url'], - 'terminal_dt': cdx['datetime'], - 'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'), + "terminal_url": cdx["url"], + "terminal_dt": cdx["datetime"], + "terminal_status_code": cdx.get("status_code") or cdx.get("http_status"), } # work around old schema - if 'terminal_url' not in terminal: - terminal['terminal_url'] = terminal['url'] - if 'terminal_dt' not in terminal: - terminal['terminal_dt'] = terminal['dt'] + if "terminal_url" not in terminal: + terminal["terminal_url"] = terminal["url"] + if "terminal_dt" not in terminal: + terminal["terminal_dt"] = terminal["dt"] # convert CDX-style digits to ISO-style timestamp - assert len(terminal['terminal_dt']) == 14 - terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z" + assert len(terminal["terminal_dt"]) == 14 + terminal["terminal_timestamp"] = ( + datetime.datetime.strptime(terminal["terminal_dt"], "%Y%m%d%H%M%S").isoformat() + + "Z" + ) return terminal def parse_urls(self, row, terminal): - request = row['request'] + request = row["request"] default_rel = self.default_link_rel - if request.get('link_source') == 'doi': - default_rel = 'publisher' - default_rel = request.get('rel', default_rel) - url = make_rel_url(terminal['terminal_url'], default_rel) + if request.get("link_source") == "doi": + default_rel = "publisher" + default_rel = request.get("rel", default_rel) + url = make_rel_url(terminal["terminal_url"], default_rel) if not url: - self.counts['skip-url'] += 1 + self.counts["skip-url"] += 1 return None wayback = "https://web.archive.org/web/{}/{}".format( - terminal['terminal_dt'], - terminal['terminal_url']) + terminal["terminal_dt"], terminal["terminal_url"] + ) urls = [url, ("webarchive", wayback)] urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] @@ -212,38 +229,38 @@ class IngestFileResultImporter(EntityImporter): def parse_edit_extra(self, row): - request = row['request'] + request = row["request"] edit_extra = dict() - if request.get('edit_extra'): - edit_extra = request['edit_extra'] + if request.get("edit_extra"): + edit_extra = request["edit_extra"] - if request.get('ingest_request_source'): - edit_extra['ingest_request_source'] = request['ingest_request_source'] - if request.get('link_source') and request.get('link_source_id'): - edit_extra['link_source'] = request['link_source'] - edit_extra['link_source_id'] = request['link_source_id'] - if edit_extra['link_source'] == 'doi': - edit_extra['link_source_id'] = edit_extra['link_source_id'].lower() + if request.get("ingest_request_source"): + edit_extra["ingest_request_source"] = request["ingest_request_source"] + if request.get("link_source") and request.get("link_source_id"): + edit_extra["link_source"] = request["link_source"] + edit_extra["link_source_id"] = request["link_source_id"] + if edit_extra["link_source"] == "doi": + edit_extra["link_source_id"] = edit_extra["link_source_id"].lower() # GROBID metadata, for SPN requests (when there might not be 'success') - if request.get('ingest_type') == 'pdf': - if row.get('grobid') and row['grobid'].get('status') != 'success': - edit_extra['grobid_status_code'] = row['grobid']['status_code'] - edit_extra['grobid_version'] = row['grobid'].get('grobid_version') + if request.get("ingest_type") == "pdf": + if row.get("grobid") and row["grobid"].get("status") != "success": + edit_extra["grobid_status_code"] = row["grobid"]["status_code"] + edit_extra["grobid_version"] = 
row["grobid"].get("grobid_version") return edit_extra def parse_record(self, row): - request = row['request'] - file_meta = row['file_meta'] + request = row["request"] + file_meta = row["file_meta"] # double check that want() filtered request correctly (eg, old requests) - if request.get('ingest_type') not in ('pdf', 'xml'): - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") not in ("pdf", "xml"): + self.counts["skip-ingest-type"] += 1 return None - assert (request['ingest_type'], file_meta['mimetype']) in [ + assert (request["ingest_type"], file_meta["mimetype"]) in [ ("pdf", "application/pdf"), ("xml", "application/xml"), ("xml", "application/jats+xml"), @@ -255,23 +272,23 @@ class IngestFileResultImporter(EntityImporter): release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None terminal = self.parse_terminal(row) if not terminal: # TODO: support archive.org hits? - self.counts['skip-no-terminal'] += 1 + self.counts["skip-no-terminal"] += 1 return None urls = self.parse_urls(row, terminal) fe = fatcat_openapi_client.FileEntity( - md5=file_meta['md5hex'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - mimetype=file_meta['mimetype'], + md5=file_meta["md5hex"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + mimetype=file_meta["mimetype"], release_ids=[release_ident], urls=urls, ) @@ -293,7 +310,7 @@ class IngestFileResultImporter(EntityImporter): # check for existing edits-in-progress with same file hash for other in self._entity_queue: if other.sha1 == fe.sha1: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False if not existing: @@ -302,31 +319,36 @@ class IngestFileResultImporter(EntityImporter): # NOTE: the following checks all assume there is an existing item if (fe.release_ids[0] in existing.release_ids) and existing.urls: # TODO: could still, in theory update with the new URL? 
- self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not self.do_updates: - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False # TODO: for now, never update - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_file(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) class SavePaperNowFileImporter(IngestFileResultImporter): @@ -338,29 +360,29 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFileImporter') - kwargs['submit_mode'] = submit_mode - kwargs['require_grobid'] = False - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Files crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFileImporter") + kwargs["submit_mode"] = submit_mode + kwargs["require_grobid"] = False + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False if not self.want_file(row): @@ -377,14 +399,14 @@ class IngestWebResultImporter(IngestFileResultImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter') - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Webcaptures crawled from web using sandcrawler ingest tool" + ) + 
eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestWebResultImporter") + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): @@ -392,91 +414,95 @@ class IngestWebResultImporter(IngestFileResultImporter): return False # webcapture-specific filters - if row['request'].get('ingest_type') != 'html': - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False - if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return False return True def parse_record(self, row): - request = row['request'] - file_meta = row['file_meta'] + request = row["request"] + file_meta = row["file_meta"] # double check that want() filtered request correctly (eg, old requests) - if request.get('ingest_type') != "html": - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return None - if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return None # identify release by fatcat ident, or extid lookup release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None terminal = self.parse_terminal(row) if not terminal: # TODO: support archive.org hits? 
- self.counts['skip-no-terminal'] += 1 + self.counts["skip-no-terminal"] += 1 return None urls = self.parse_urls(row, terminal) - archive_urls = [u for u in urls if u.rel == 'webarchive'] + archive_urls = [u for u in urls if u.rel == "webarchive"] - if terminal['terminal_status_code'] != 200: - self.counts['skip-terminal-status-code'] += 1 + if terminal["terminal_status_code"] != 200: + self.counts["skip-terminal-status-code"] += 1 return None - terminal_cdx = row['cdx'] - if 'revisit_cdx' in row: - terminal_cdx = row['revisit_cdx'] - assert terminal_cdx['surt'] - if terminal_cdx['url'] != terminal['terminal_url']: - self.counts['skip-terminal-url-mismatch'] += 1 + terminal_cdx = row["cdx"] + if "revisit_cdx" in row: + terminal_cdx = row["revisit_cdx"] + assert terminal_cdx["surt"] + if terminal_cdx["url"] != terminal["terminal_url"]: + self.counts["skip-terminal-url-mismatch"] += 1 return None wc_cdx = [] # primary resource first - wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - surt=terminal_cdx['surt'], - timestamp=terminal['terminal_timestamp'], - url=terminal['terminal_url'], - mimetype=file_meta['mimetype'], - status_code=terminal['terminal_status_code'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - )) - - for resource in row.get('html_resources', []): - timestamp = resource['timestamp'] + wc_cdx.append( + fatcat_openapi_client.WebcaptureCdxLine( + surt=terminal_cdx["surt"], + timestamp=terminal["terminal_timestamp"], + url=terminal["terminal_url"], + mimetype=file_meta["mimetype"], + status_code=terminal["terminal_status_code"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + ) + ) + + for resource in row.get("html_resources", []): + timestamp = resource["timestamp"] if "+" not in timestamp and "Z" not in timestamp: timestamp += "Z" - wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - surt=resource['surt'], - timestamp=timestamp, - url=resource['url'], - mimetype=resource.get('mimetype'), - size=resource.get('size'), - sha1=resource.get('sha1hex'), - sha256=resource.get('sha256hex'), - )) + wc_cdx.append( + fatcat_openapi_client.WebcaptureCdxLine( + surt=resource["surt"], + timestamp=timestamp, + url=resource["url"], + mimetype=resource.get("mimetype"), + size=resource.get("size"), + sha1=resource.get("sha1hex"), + sha256=resource.get("sha256hex"), + ) + ) wc = fatcat_openapi_client.WebcaptureEntity( cdx=wc_cdx, archive_urls=archive_urls, - original_url=terminal['terminal_url'], - timestamp=terminal['terminal_timestamp'], + original_url=terminal["terminal_url"], + timestamp=terminal["terminal_timestamp"], release_ids=[release_ident], ) @@ -491,11 +517,11 @@ class IngestWebResultImporter(IngestFileResultImporter): # check for existing edits-in-progress with same URL for other in self._entity_queue: if other.original_url == wc.original_url: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False # lookup sha1, or create new entity (TODO: API doesn't support this yet) - #existing = None + # existing = None # TODO: currently only allow one release per webcapture release = self.api.get_release(wc.release_ids[0], expand="webcaptures") @@ -504,9 +530,9 @@ class IngestWebResultImporter(IngestFileResultImporter): for other in release.webcaptures: if wc.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False - 
self.counts['skip-release-has-webcapture'] += 1 + self.counts["skip-release-has-webcapture"] += 1 return False # Ok, if we got here then no existing web capture for (first) release, @@ -515,18 +541,24 @@ class IngestWebResultImporter(IngestFileResultImporter): def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_webcapture(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_webcapture_auto_batch( + fatcat_openapi_client.WebcaptureAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) + class SavePaperNowWebImporter(IngestWebResultImporter): """ @@ -535,15 +567,15 @@ class SavePaperNowWebImporter(IngestWebResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowWebImporter') - kwargs['submit_mode'] = submit_mode - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Webcaptures crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowWebImporter") + kwargs["submit_mode"] = submit_mode + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): """ @@ -553,27 +585,27 @@ class SavePaperNowWebImporter(IngestWebResultImporter): path, which means allowing hit=false. 
""" - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False # webcapture-specific filters - if row['request'].get('ingest_type') != 'html': - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False - if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return False - if row.get('status') not in ['success', 'unknown-scope']: - self.counts['skip-hit'] += 1 + if row.get("status") not in ["success", "unknown-scope"]: + self.counts["skip-hit"] += 1 return False return True @@ -587,28 +619,28 @@ class IngestFilesetResultImporter(IngestFileResultImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Filesets crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFilesetResultImporter') - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Filesets crawled from web using sandcrawler ingest tool" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFilesetResultImporter") + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.max_file_count = 300 def want_fileset(self, row): - if not row.get('manifest') or len(row.get('manifest')) == 0: - self.counts['skip-empty-manifest'] += 1 + if not row.get("manifest") or len(row.get("manifest")) == 0: + self.counts["skip-empty-manifest"] += 1 return False - if len(row.get('manifest')) == 1: - self.counts['skip-single-file'] += 1 + if len(row.get("manifest")) == 1: + self.counts["skip-single-file"] += 1 return False - if len(row.get('manifest')) > self.max_file_count: - self.counts['skip-too-many-files'] += 1 + if len(row.get("manifest")) > self.max_file_count: + self.counts["skip-too-many-files"] += 1 return False return True @@ -619,8 +651,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return False # fileset-specific filters - if row['request'].get('ingest_type') not in ['dataset',]: - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") not in [ + "dataset", + ]: + self.counts["skip-ingest-type"] += 1 return False if not self.want_fileset(row): @@ -629,102 +663,118 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True def parse_fileset_urls(self, row): - if not row.get('strategy'): + if not row.get("strategy"): return [] - strategy = row['strategy'] + strategy = row["strategy"] urls = [] - if strategy == 'archiveorg-fileset' and row.get('archiveorg_item_name'): - 
urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}/", - rel="archive-base", - )) - if row['strategy'].startswith('web-') and row.get('platform_base_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", - rel="webarchive-base", - )) + if strategy == "archiveorg-fileset" and row.get("archiveorg_item_name"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/", + rel="archive-base", + ) + ) + if row["strategy"].startswith("web-") and row.get("platform_base_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", + rel="webarchive-base", + ) + ) # TODO: repository-base # TODO: web-base - if row['strategy'] == 'archiveorg-fileset-bundle' and row.get('archiveorg_item_name'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", - rel="archive-bundle", - )) + if row["strategy"] == "archiveorg-fileset-bundle" and row.get("archiveorg_item_name"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", + rel="archive-bundle", + ) + ) - if row['strategy'] == 'web-fileset-bundle' and row.get('platform_bundle_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", - rel="webarchive-bundle", - )) + if row["strategy"] == "web-fileset-bundle" and row.get("platform_bundle_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", + rel="webarchive-bundle", + ) + ) # add any additional / platform URLs here - if row.get('platform_bundle_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=row['platform_bundle_url'], - rel="repository-bundle", - )) - if row.get('platform_base_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=row['platform_bundle_url'], - rel="repository-base", - )) + if row.get("platform_bundle_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=row["platform_bundle_url"], + rel="repository-bundle", + ) + ) + if row.get("platform_base_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=row["platform_bundle_url"], + rel="repository-base", + ) + ) return urls def parse_record(self, row): - request = row['request'] + request = row["request"] # double check that want() filtered request correctly - if request.get('ingest_type') not in ["dataset",]: - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") not in [ + "dataset", + ]: + self.counts["skip-ingest-type"] += 1 return None # identify release by fatcat ident, or extid lookup release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None entity_extra = dict() edit_extra = self.parse_edit_extra(row) - edit_extra['ingest_strategy'] = row['ingest_strategy'] - if row.get('platform'): - edit_extra['platform'] = row['platform'] - if row.get('platform_id'): - edit_extra['platform_id'] = row['platform_id'] + edit_extra["ingest_strategy"] = row["ingest_strategy"] + if row.get("platform"): + edit_extra["platform"] = 
row["platform"] + if row.get("platform_id"): + edit_extra["platform_id"] = row["platform_id"] entity_urls = self.parse_fileset_urls(row) if not entity_urls: - self.counts['skip-no-access-url'] += 1 + self.counts["skip-no-access-url"] += 1 return None - assert row['file_count'] == len(row['manifest']) - if row['file_count'] > self.max_file_count: - self.counts['skip-too-many-manifest-files'] += 1 + assert row["file_count"] == len(row["manifest"]) + if row["file_count"] > self.max_file_count: + self.counts["skip-too-many-manifest-files"] += 1 return None manifest = [] - for ingest_file in row['manifest']: + for ingest_file in row["manifest"]: fsf = fatcat_openapi_client.FilesetFile( - path=ingest_file['path'], - size=ingest_file['size'], - md5=ingest_file['md5'], - sha1=ingest_file['sha1'], - sha256=ingest_file.get('sha256'), + path=ingest_file["path"], + size=ingest_file["size"], + md5=ingest_file["md5"], + sha1=ingest_file["sha1"], + sha256=ingest_file.get("sha256"), extra=dict( - mimetype=ingest_file['mimetype'], + mimetype=ingest_file["mimetype"], ), ) if not (fsf.md5 and fsf.sha1 and fsf.path and fsf.size): - self.counts['skip-partial-file-info'] += 1 + self.counts["skip-partial-file-info"] += 1 return None - if ingest_file.get('platform_url'): + if ingest_file.get("platform_url"): # XXX: should we include this? - fsf.extra['original_url'] = ingest_file['platform_url'] - if ingest_file.get('terminal_url') and ingest_file.get('terminal_dt'): - fsf.extra['wayback_url'] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}" + fsf.extra["original_url"] = ingest_file["platform_url"] + if ingest_file.get("terminal_url") and ingest_file.get("terminal_dt"): + fsf.extra[ + "wayback_url" + ] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}" manifest.append(fsf) fe = fatcat_openapi_client.FilesetEntity( @@ -745,11 +795,11 @@ class IngestFilesetResultImporter(IngestFileResultImporter): for other in self._entity_queue: # XXX: how to duplicate check? 
if other.original_url == wc.original_url: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False # lookup sha1, or create new entity (TODO: API doesn't support this yet) - #existing = None + # existing = None # NOTE: in lieu of existing checks (by lookup), only allow one fileset per release release = self.api.get_release(wc.release_ids[0], expand="filesets") @@ -759,27 +809,32 @@ class IngestFilesetResultImporter(IngestFileResultImporter): for other in release.filesets: if wc.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False - self.counts['skip-release-has-fileset'] += 1 + self.counts["skip-release-has-fileset"] += 1 return False return True def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_fileset(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_fileset_auto_batch( + fatcat_openapi_client.FilesetAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) class SavePaperNowFilesetImporter(IngestFilesetResultImporter): @@ -789,28 +844,28 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Fileset crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFilesetImporter') - kwargs['submit_mode'] = submit_mode - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Fileset crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFilesetImporter") + kwargs["submit_mode"] = submit_mode + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False if not self.want_fileset(row): diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 0a983c5e..8e3af416 100644 --- a/python/fatcat_tools/importers/jalc.py 
+++ b/python/fatcat_tools/importers/jalc.py @@ -1,4 +1,3 @@ - import datetime import sqlite3 import sys @@ -33,26 +32,24 @@ def parse_jalc_persons(raw_persons): # first parse out into language-agnostic dics for raw in raw_persons: - name = raw.find('name') or None + name = raw.find("name") or None if name: - name = clean(name.get_text().replace('\n', ' ')) - surname = raw.find('familyName') or None + name = clean(name.get_text().replace("\n", " ")) + surname = raw.find("familyName") or None if surname: - surname = clean(surname.get_text().replace('\n', ' ')) - given_name = raw.find('givenName') or None + surname = clean(surname.get_text().replace("\n", " ")) + given_name = raw.find("givenName") or None if given_name: - given_name = clean(given_name.get_text().replace('\n', ' ')) - lang = 'en' + given_name = clean(given_name.get_text().replace("\n", " ")) + lang = "en" if is_cjk(name): - lang = 'ja' - if lang == 'en' and surname and given_name: + lang = "ja" + if lang == "en" and surname and given_name: # english names order is flipped name = "{} {}".format(given_name, surname) rc = fatcat_openapi_client.ReleaseContrib( - raw_name=name, - surname=surname, - given_name=given_name, - role="author") + raw_name=name, surname=surname, given_name=given_name, role="author" + ) # add an extra hint field; won't end up in serialized object rc._lang = lang persons.append(rc) @@ -60,12 +57,12 @@ def parse_jalc_persons(raw_persons): if not persons: return [] - if all([p._lang == 'en' for p in persons]) or all([p._lang == 'ja' for p in persons]): + if all([p._lang == "en" for p in persons]) or all([p._lang == "ja" for p in persons]): # all english names, or all japanese names return persons # for debugging - #if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']): + # if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']): # print("INTERESTING: {}".format(persons[0])) start_lang = persons[0]._lang @@ -74,10 +71,10 @@ def parse_jalc_persons(raw_persons): if p._lang == start_lang: contribs.append(p) else: - if p._lang == 'en' and contribs[-1]._lang == 'ja': + if p._lang == "en" and contribs[-1]._lang == "ja": eng = p jpn = contribs[-1] - elif p._lang == 'ja' and contribs[-1]._lang == 'en': + elif p._lang == "ja" and contribs[-1]._lang == "en": eng = contribs[-1] jpn = p else: @@ -85,11 +82,11 @@ def parse_jalc_persons(raw_persons): contribs.append(p) continue eng.extra = { - 'original_name': { - 'lang': jpn._lang, - 'raw_name': jpn.raw_name, - 'given_name': jpn.given_name, - 'surname': jpn.surname, + "original_name": { + "lang": jpn._lang, + "raw_name": jpn.raw_name, + "given_name": jpn.given_name, + "surname": jpn.surname, }, } contribs[-1] = eng @@ -105,18 +102,19 @@ class JalcImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of JALC DOI metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JalcImporter') - super().__init__(api, + eg_desc = kwargs.get("editgroup_description", "Automated import of JALC DOI metadata") + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JalcImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers = kwargs.get('create_containers', True) - extid_map_file = 
kwargs.get('extid_map_file') + self.create_containers = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -129,12 +127,27 @@ class JalcImporter(EntityImporter): def lookup_ext_ids(self, doi): if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -163,27 +176,27 @@ class JalcImporter(EntityImporter): titles = record.find_all("title") if not titles: return None - title = titles[0].get_text().replace('\n', ' ').strip() + title = titles[0].get_text().replace("\n", " ").strip() original_title = None - if title.endswith('.'): + if title.endswith("."): title = title[:-1] if len(titles) > 1: - original_title = titles[1].get_text().replace('\n', ' ').strip() - if original_title.endswith('.'): + original_title = titles[1].get_text().replace("\n", " ").strip() + if original_title.endswith("."): original_title = original_title[:-1] doi = None if record.doi: doi = clean_doi(record.doi.string.strip().lower()) - if doi.startswith('http://dx.doi.org/'): - doi = doi.replace('http://dx.doi.org/', '') - elif doi.startswith('https://dx.doi.org/'): - doi = doi.replace('https://dx.doi.org/', '') - elif doi.startswith('http://doi.org/'): - doi = doi.replace('http://doi.org/', '') - elif doi.startswith('https://doi.org/'): - doi = doi.replace('https://doi.org/', '') - if not (doi.startswith('10.') and '/' in doi): + if doi.startswith("http://dx.doi.org/"): + doi = doi.replace("http://dx.doi.org/", "") + elif doi.startswith("https://dx.doi.org/"): + doi = doi.replace("https://dx.doi.org/", "") + elif doi.startswith("http://doi.org/"): + doi = doi.replace("http://doi.org/", "") + elif doi.startswith("https://doi.org/"): + doi = doi.replace("https://doi.org/", "") + if not (doi.startswith("10.") and "/" in doi): sys.stderr.write("bogus JALC DOI: {}\n".format(doi)) doi = None if not doi: @@ -202,7 +215,9 @@ class JalcImporter(EntityImporter): if date: date = date.string if len(date) == 10: - release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date() + release_date = datetime.datetime.strptime( + date["completed-date"], DATE_FMT + ).date() release_year = release_date.year release_date = release_date.isoformat() elif len(date) == 4 and date.isdigit(): @@ -214,7 +229,7 @@ class JalcImporter(EntityImporter): if record.endingPage and record.endingPage.string.strip(): pages = "{}-{}".format(pages, record.endingPage.string.strip()) # double check to prevent "-" as pages - if pages and pages.strip() == '-': + if pages and pages.strip() == "-": pages = None volume = None @@ -242,9 +257,13 @@ class JalcImporter(EntityImporter): container_extra = dict() if 
record.publicationName: - pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()] + pubs = [ + p.get_text().replace("\n", " ").strip() + for p in record.find_all("publicationName") + if p.get_text() + ] pubs = [clean(p) for p in pubs if p] - assert(pubs) + assert pubs if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] if len(pubs) > 1 and is_cjk(pubs[0]): @@ -252,10 +271,14 @@ class JalcImporter(EntityImporter): pubs = [pubs[1], pubs[0]] container_name = clean(pubs[0]) if len(pubs) > 1: - container_extra['original_name'] = clean(pubs[1]) + container_extra["original_name"] = clean(pubs[1]) if record.publisher: - pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()] + pubs = [ + p.get_text().replace("\n", " ").strip() + for p in record.find_all("publisher") + if p.get_text() + ] pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] @@ -265,20 +288,25 @@ class JalcImporter(EntityImporter): if pubs: publisher = clean(pubs[0]) if len(pubs) > 1: - container_extra['publisher_aliases'] = pubs[1:] - - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + container_extra["publisher_aliases"] = pubs[1:] + + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): # name, type, publisher, issnl # extra: issnp, issne, original_name, languages, country - container_extra['country'] = 'jp' - container_extra['languages'] = ['ja'] + container_extra["country"] = "jp" + container_extra["languages"] = ["ja"] ce = fatcat_openapi_client.ContainerEntity( name=container_name, - container_type='journal', + container_type="journal", publisher=publisher, issnl=issnl, - extra=(container_extra or None)) + extra=(container_extra or None), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident # short-cut future imports in same batch @@ -301,7 +329,7 @@ class JalcImporter(EntityImporter): # group-title # always put at least an empty dict here to indicate the DOI registrar # (informally) - extra['jalc'] = extra_jalc + extra["jalc"] = extra_jalc title = clean(title) if not title: @@ -312,24 +340,24 @@ class JalcImporter(EntityImporter): title=title, original_title=clean(original_title), release_type=release_type, - release_stage='published', + release_stage="published", release_date=release_date, release_year=release_year, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), volume=volume, issue=issue, pages=pages, publisher=publisher, language=lang, - #license_slug + # license_slug container_id=container_id, contribs=contribs, extra=extra, @@ -351,17 +379,20 @@ class JalcImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + 
self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): """ @@ -374,11 +405,11 @@ class JalcImporter(EntityImporter): # 2. iterate over articles, call parse_article on each for record in soup.find_all("Description"): resp = self.parse_record(record) - #print(json.dumps(resp)) + # print(json.dumps(resp)) print(resp) - #sys.exit(-1) + # sys.exit(-1) -if __name__=='__main__': +if __name__ == "__main__": parser = JalcImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 25d7b3b5..6d1fefa3 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from .common import EntityImporter, clean @@ -11,18 +10,20 @@ def or_none(s): return None return s + def truthy(s): if s is None: return None s = s.lower() - if s in ('true', 't', 'yes', 'y', '1'): + if s in ("true", "t", "yes", "y", "1"): return True - elif s in ('false', 'f', 'no', 'n', '0'): + elif s in ("false", "f", "no", "n", "0"): return False else: return None + class JournalMetadataImporter(EntityImporter): """ Imports journal metadata ("containers") by ISSN, currently from a custom @@ -33,17 +34,16 @@ class JournalMetadataImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of container-level metadata, by ISSN. 
Metadata from Internet Archive munging.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JournalMetadataImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): - if raw_record.get('issnl') and raw_record.get('name'): + if raw_record.get("issnl") and raw_record.get("name"): return True return False @@ -54,52 +54,68 @@ class JournalMetadataImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - if not row.get('name'): + if not row.get("name"): # Name is required (by schema) return None extra = dict() - for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev', - 'coden', 'aliases', 'original_name', 'first_year', 'last_year', - 'platform', 'default_license', 'road', 'mimetypes', - 'sherpa_romeo', 'kbart'): + for key in ( + "issne", + "issnp", + "languages", + "country", + "urls", + "abbrev", + "coden", + "aliases", + "original_name", + "first_year", + "last_year", + "platform", + "default_license", + "road", + "mimetypes", + "sherpa_romeo", + "kbart", + ): if row.get(key): extra[key] = row[key] # TODO: not including for now: norwegian, dois/crossref, ia extra_doaj = dict() - if row.get('doaj'): - if row['doaj'].get('as_of'): - extra_doaj['as_of'] = row['doaj']['as_of'] - if row['doaj'].get('works'): - extra_doaj['works'] = row['doaj']['works'] + if row.get("doaj"): + if row["doaj"].get("as_of"): + extra_doaj["as_of"] = row["doaj"]["as_of"] + if row["doaj"].get("works"): + extra_doaj["works"] = row["doaj"]["works"] if extra_doaj: - extra['doaj'] = extra_doaj + extra["doaj"] = extra_doaj extra_ia = dict() # TODO: would like an ia.longtail_ia flag - if row.get('sim'): + if row.get("sim"): # NB: None case of the .get() here is blech, but othrwise # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on - extra_ia['sim'] = { - 'year_spans': row['sim'].get('year_spans'), + extra_ia["sim"] = { + "year_spans": row["sim"].get("year_spans"), } if extra_ia: - extra['ia'] = extra_ia + extra["ia"] = extra_ia - name = clean(row.get('name')) + name = clean(row.get("name")) if not name: return None ce = fatcat_openapi_client.ContainerEntity( - issnl=row['issnl'], - issne=row.get('issne'), - issnp=row.get('issnp'), - container_type=None, # TODO + issnl=row["issnl"], + issne=row.get("issne"), + issnp=row.get("issnp"), + container_type=None, # TODO name=name, - publisher=clean(row.get('publisher')), - wikidata_qid=None, # TODO - extra=extra) + publisher=clean(row.get("publisher")), + wikidata_qid=None, # TODO + extra=extra, + ) return ce def try_update(self, ce): @@ -118,23 +134,26 @@ class JournalMetadataImporter(EntityImporter): # for now, only update KBART, and only if there is new content if not existing.extra: existing.extra = dict() - if ce.extra.get('kbart') and (existing.extra.get('kbart') != ce.extra['kbart']): - if not existing.extra.get('kbart'): - existing.extra['kbart'] = {} - existing.extra['kbart'].update(ce.extra['kbart']) + if ce.extra.get("kbart") and (existing.extra.get("kbart") != ce.extra["kbart"]): + if not existing.extra.get("kbart"): + existing.extra["kbart"] = {} + existing.extra["kbart"].update(ce.extra["kbart"]) self.api.update_container(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False else: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # if we got this far, it's a bug raise 
NotImplementedError def insert_batch(self, batch): - self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_container_auto_batch( + fatcat_openapi_client.ContainerAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index d37424d6..8c7bfad4 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,4 +1,3 @@ - import datetime import json import sys @@ -12,10 +11,10 @@ from .crossref import CONTAINER_TYPE_MAP # TODO: more entries? JSTOR_CONTRIB_MAP = { - 'author': 'author', - 'editor': 'editor', - 'translator': 'translator', - 'illustrator': 'illustrator', + "author": "author", + "editor": "editor", + "translator": "translator", + "illustrator": "illustrator", } JSTOR_TYPE_MAP = { @@ -26,6 +25,7 @@ JSTOR_TYPE_MAP = { "research-article": "article-journal", } + class JstorImporter(EntityImporter): """ Importer for JSTOR bulk XML metadata (eg, from their Early Journals @@ -34,17 +34,18 @@ class JstorImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of JSTOR XML metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter') - super().__init__(api, + eg_desc = kwargs.get("editgroup_description", "Automated import of JSTOR XML metadata") + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JstorImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers = kwargs.get('create_containers', True) + self.create_containers = kwargs.get("create_containers", True) self.read_issn_map_file(issn_map_file) @@ -62,20 +63,22 @@ class JstorImporter(EntityImporter): extra = dict() extra_jstor = dict() - release_type = JSTOR_TYPE_MAP.get(article['article-type']) + release_type = JSTOR_TYPE_MAP.get(article["article-type"]) title = article_meta.find("article-title") if title and title.get_text(): - title = title.get_text().replace('\n', ' ').strip() + title = title.get_text().replace("\n", " ").strip() elif title and not title.get_text(): title = None - if not title and release_type.startswith('review') and article_meta.product.source: - title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text()) + if not title and release_type.startswith("review") and article_meta.product.source: + title = "Review: {}".format( + article_meta.product.source.replace("\n", " ").get_text() + ) if not title: return None - if title.endswith('.'): + if title.endswith("."): title = title[:-1] if "[Abstract]" in title: @@ -93,12 +96,12 @@ class JstorImporter(EntityImporter): title = title[1:-1] # JSTOR journal-id - journal_ids = [j.string for j in journal_meta.find_all('journal-id')] + journal_ids = [j.string for j in journal_meta.find_all("journal-id")] if journal_ids: - extra_jstor['journal_ids'] = journal_ids + extra_jstor["journal_ids"] = journal_ids - journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ') - publisher = 
journal_meta.find("publisher-name").get_text().replace('\n', ' ') + journal_title = journal_meta.find("journal-title").get_text().replace("\n", " ") + publisher = journal_meta.find("publisher-name").get_text().replace("\n", " ") issn = journal_meta.find("issn") if issn: issn = issn.string @@ -113,13 +116,18 @@ class JstorImporter(EntityImporter): container_id = self.lookup_issnl(issnl) # create container if it doesn't exist - if (container_id is None and self.create_containers and (issnl is not None) - and journal_title): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and journal_title + ): ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=clean(journal_title, force_xml=True)) + name=clean(journal_title, force_xml=True), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -132,8 +140,8 @@ class JstorImporter(EntityImporter): if jstor_id: jstor_id = jstor_id.string.strip() if not jstor_id and doi: - assert doi.startswith('10.2307/') - jstor_id = doi.replace('10.2307/', '') + assert doi.startswith("10.2307/") + jstor_id = doi.replace("10.2307/", "") assert jstor_id and int(jstor_id) contribs = [] @@ -142,13 +150,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.get_text().replace('\n', ' ')) + given = clean(given.get_text().replace("\n", " ")) surname = c.find("surname") if surname: - surname = clean(surname.get_text().replace('\n', ' ')) + surname = clean(surname.get_text().replace("\n", " ")) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.get_text().replace('\n', ' ')) + raw_name = clean(raw_name.get_text().replace("\n", " ")) if not raw_name: if given and surname: @@ -156,15 +164,17 @@ class JstorImporter(EntityImporter): elif surname: raw_name = surname - role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author')) - if not role and c.get('contrib-type'): - sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c['contrib-type'])) - contribs.append(fatcat_openapi_client.ReleaseContrib( - role=role, - raw_name=raw_name, - given_name=given, - surname=surname, - )) + role = JSTOR_CONTRIB_MAP.get(c.get("contrib-type", "author")) + if not role and c.get("contrib-type"): + sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c["contrib-type"])) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + role=role, + raw_name=raw_name, + given_name=given, + surname=surname, + ) + ) for i, contrib in enumerate(contribs): if contrib.raw_name != "et al.": @@ -172,14 +182,13 @@ class JstorImporter(EntityImporter): release_year = None release_date = None - pub_date = article_meta.find('pub-date') + pub_date = article_meta.find("pub-date") if pub_date and pub_date.year: release_year = int(pub_date.year.string) if pub_date.month and pub_date.day: release_date = datetime.date( - release_year, - int(pub_date.month.string), - int(pub_date.day.string)) + release_year, int(pub_date.month.string), int(pub_date.day.string) + ) if release_date.day == 1 and release_date.month == 1: # suspect jan 1st dates get set by JSTOR when actual # date not known (citation needed), so drop them @@ -208,10 +217,10 @@ class JstorImporter(EntityImporter): warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) # JSTOR issue-id - if article_meta.find('issue-id'): - issue_id = 
clean(article_meta.find('issue-id').string) + if article_meta.find("issue-id"): + issue_id = clean(article_meta.find("issue-id").string) if issue_id: - extra_jstor['issue_id'] = issue_id + extra_jstor["issue_id"] = issue_id # everything in JSTOR is published release_stage = "published" @@ -225,14 +234,14 @@ class JstorImporter(EntityImporter): # group-title # pubmed: retraction refs if extra_jstor: - extra['jstor'] = extra_jstor + extra["jstor"] = extra_jstor if not extra: extra = None re = fatcat_openapi_client.ReleaseEntity( - #work_id + # work_id title=title, - #original_title + # original_title release_type=release_type, release_stage=release_stage, release_date=release_date, @@ -246,21 +255,16 @@ class JstorImporter(EntityImporter): pages=pages, publisher=publisher, language=language, - #license_slug - + # license_slug # content, mimetype, lang - #abstracts=abstracts, - + # abstracts=abstracts, contribs=contribs, - # key, year, container_name, title, locator # extra: volume, authors, issue, publisher, identifiers - #refs=refs, - + # refs=refs, # name, type, publisher, issnl # extra: issnp, issne, original_name, languages, country container_id=container_id, - extra=extra, ) return re @@ -289,12 +293,12 @@ class JstorImporter(EntityImporter): if existing and existing.ext_ids.jstor: # don't update if it already has JSTOR ID - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False elif existing: # but do update if only DOI was set existing.ext_ids.jstor = re.ext_ids.jstor - existing.extra['jstor'] = re.extra['jstor'] + existing.extra["jstor"] = re.extra["jstor"] # better release_type detection, and some other fields # TODO: don't do this over-writing in the future? assuming here # this is a one-time batch import over/extending bootstrap crossref @@ -304,17 +308,20 @@ class JstorImporter(EntityImporter): existing.contribs = re.contribs existing.language = re.language self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): @@ -325,8 +332,9 @@ class JstorImporter(EntityImporter): for article in soup.find_all("article"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) + -if __name__=='__main__': +if __name__ == "__main__": parser = JstorImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 09807276..7c2a6a87 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from fatcat_tools.normal import clean_doi @@ -32,13 +31,13 @@ class MatchedImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Import of large-scale file-to-release match results. Source of metadata varies." 
- eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Import of large-scale file-to-release match results. Source of metadata varies." + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.MatchedImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mimetype = kwargs.get("default_mimetype", None) @@ -46,14 +45,14 @@ class MatchedImporter(EntityImporter): return True def parse_record(self, obj): - dois = [d.lower() for d in obj.get('dois', [])] + dois = [d.lower() for d in obj.get("dois", [])] # lookup dois re_list = set() for doi in dois: doi = clean_doi(doi) if not doi: - self.counts['skip-bad-doi'] += 1 + self.counts["skip-bad-doi"] += 1 return None try: re = self.api.lookup_release(doi=doi) @@ -62,13 +61,22 @@ class MatchedImporter(EntityImporter): raise err re = None if re is None: - #print("DOI not found: {}".format(doi)) + # print("DOI not found: {}".format(doi)) pass else: re_list.add(re.ident) # look up other external ids - for extid_type in ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid', 'core', 'isbn13', 'ark'): + for extid_type in ( + "arxiv", + "pmid", + "pmcid", + "jstor", + "wikidata_qid", + "core", + "isbn13", + "ark", + ): extid = obj.get(extid_type) if extid: try: @@ -84,49 +92,47 @@ class MatchedImporter(EntityImporter): release_ids = list(re_list) if len(release_ids) == 0: - self.counts['skip-no-releases'] += 1 + self.counts["skip-no-releases"] += 1 return None if len(release_ids) > SANE_MAX_RELEASES: - self.counts['skip-too-many-releases'] += 1 + self.counts["skip-too-many-releases"] += 1 return None # parse URLs and CDX urls = set() - for url in obj.get('urls', []): + for url in obj.get("urls", []): url = make_rel_url(url, default_link_rel=self.default_link_rel) if url is not None: urls.add(url) - for cdx in obj.get('cdx', []): - original = cdx['url'] - if cdx.get('dt'): - wayback = "https://web.archive.org/web/{}/{}".format( - cdx['dt'], - original) + for cdx in obj.get("cdx", []): + original = cdx["url"] + if cdx.get("dt"): + wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original) urls.add(("webarchive", wayback)) url = make_rel_url(original, default_link_rel=self.default_link_rel) if url is not None: urls.add(url) urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] if len(urls) == 0: - self.counts['skip-no-urls'] += 1 + self.counts["skip-no-urls"] += 1 return None if len(urls) > SANE_MAX_URLS: - self.counts['skip-too-many-urls'] += 1 + self.counts["skip-too-many-urls"] += 1 return None - size = obj.get('size') + size = obj.get("size") if size: size = int(size) - mimetype = obj.get('mimetype', self.default_mimetype) + mimetype = obj.get("mimetype", self.default_mimetype) if not mimetype and urls: - if urls[0].url.endswith('.pdf'): - mimetype = 'application/pdf' + if urls[0].url.endswith(".pdf"): + mimetype = "application/pdf" fe = fatcat_openapi_client.FileEntity( - md5=obj.get('md5'), - sha1=obj['sha1'], - sha256=obj.get('sha256'), + md5=obj.get("md5"), + sha1=obj["sha1"], + sha256=obj.get("sha256"), size=size, mimetype=mimetype, release_ids=release_ids, @@ -149,28 +155,30 @@ class 
MatchedImporter(EntityImporter): combined_release_ids = list(set(fe.release_ids + existing.release_ids)) if set(combined_release_ids) == set(existing.release_ids) and len(existing.urls) > 0: # no new release matches *and* there are already existing URLs - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # check for edit conflicts if existing.ident in [e.ident for e in self._edits_inflight]: - self.counts['skip-update-inflight'] += 1 + self.counts["skip-update-inflight"] += 1 return False # minimum viable "existing" URL cleanup to fix dupes and broken links: # remove 'None' wayback URLs, and set archive.org rel 'archive' - existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + existing.urls = [ + u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url) + ] for i in range(len(existing.urls)): u = existing.urls[i] - if u.rel == 'repository' and '://archive.org/download/' in u.url: - existing.urls[i].rel = 'archive' + if u.rel == "repository" and "://archive.org/download/" in u.url: + existing.urls[i].rel = "archive" # special case: if importing *new* from archive.org arxiv collections, # blow away any existing release_id mappings; this is a direct arxiv_id # map. This *should* be safe to run in all matched imports. is_arxiv = False for u in fe.urls: - if 'archive.org/download/arxiv' in u.url.lower(): + if "archive.org/download/arxiv" in u.url.lower(): is_arxiv = True break if is_arxiv and fe.release_ids: @@ -178,14 +186,16 @@ class MatchedImporter(EntityImporter): # merge the existing into this one and update existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) - existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + existing.urls = [ + fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls + ] if len(existing.urls) > SANE_MAX_URLS: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) if len(existing.release_ids) > SANE_MAX_RELEASES: - self.counts['skip-update-too-many-releases'] += 1 + self.counts["skip-update-too-many-releases"] += 1 return None existing.mimetype = existing.mimetype or fe.mimetype existing.size = existing.size or fe.size @@ -194,12 +204,15 @@ class MatchedImporter(EntityImporter): existing.sha256 = existing.sha256 or fe.sha256 edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) self._edits_inflight.append(edit) - self.counts['update'] += 1 + self.counts["update"] += 1 return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 3bdd23a1..b514e6e5 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,4 +1,3 @@ - import sys import fatcat_openapi_client @@ -8,7 +7,7 @@ from .common import EntityImporter, clean def value_or_none(e): if type(e) == dict: - e = e.get('value') + e = e.get("value") if type(e) == 
str and len(e) == 0: e = None # TODO: this is probably bogus; patched in desperation; remove? @@ -21,18 +20,17 @@ def value_or_none(e): return None return e -class OrcidImporter(EntityImporter): +class OrcidImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of ORCID metadata, from official bulk releases.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of ORCID metadata, from official bulk releases.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.OrcidImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): return True @@ -43,16 +41,16 @@ class OrcidImporter(EntityImporter): returns a CreatorEntity """ - if 'person' not in obj: + if "person" not in obj: return False - name = obj['person']['name'] + name = obj["person"]["name"] if not name: return None extra = None - given = value_or_none(name.get('given-names')) - sur = value_or_none(name.get('family-name')) - display = value_or_none(name.get('credit-name')) + given = value_or_none(name.get("given-names")) + sur = value_or_none(name.get("family-name")) + display = value_or_none(name.get("credit-name")) if display is None: # TODO: sorry human beings if given and sur: @@ -61,7 +59,7 @@ class OrcidImporter(EntityImporter): display = sur elif given: display = given - orcid = obj['orcid-identifier']['path'] + orcid = obj["orcid-identifier"]["path"] if not self.is_orcid(orcid): sys.stderr.write("Bad ORCID: {}\n".format(orcid)) return None @@ -74,7 +72,8 @@ class OrcidImporter(EntityImporter): given_name=clean(given), surname=clean(sur), display_name=display, - extra=extra) + extra=extra, + ) return ce def try_update(self, raw_record): @@ -88,14 +87,17 @@ class OrcidImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_creator_auto_batch(fatcat_openapi_client.CreatorAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_creator_auto_batch( + fatcat_openapi_client.CreatorAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 00ad54d0..cfdafcf7 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -1,4 +1,3 @@ - import datetime import json import sys @@ -13,42 +12,42 @@ from .common import LANG_MAP_MARC, EntityImporter, clean # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly PUBMED_RELEASE_TYPE_MAP = { - #Adaptive Clinical Trial + # Adaptive Clinical Trial "Address": "speech", "Autobiography": "book", - #Bibliography + # Bibliography "Biography": "book", - #Case Reports + # Case Reports "Classical Article": "article-journal", - #Clinical Conference - #Clinical Study - 
#Clinical Trial - #Clinical Trial, Phase I - #Clinical Trial, Phase II - #Clinical Trial, Phase III - #Clinical Trial, Phase IV - #Clinical Trial Protocol - #Clinical Trial, Veterinary - #Collected Works - #Comparative Study - #Congress - #Consensus Development Conference - #Consensus Development Conference, NIH - #Controlled Clinical Trial + # Clinical Conference + # Clinical Study + # Clinical Trial + # Clinical Trial, Phase I + # Clinical Trial, Phase II + # Clinical Trial, Phase III + # Clinical Trial, Phase IV + # Clinical Trial Protocol + # Clinical Trial, Veterinary + # Collected Works + # Comparative Study + # Congress + # Consensus Development Conference + # Consensus Development Conference, NIH + # Controlled Clinical Trial "Dataset": "dataset", - #Dictionary - #Directory - #Duplicate Publication + # Dictionary + # Directory + # Duplicate Publication "Editorial": "editorial", - #English Abstract # doesn't indicate that this is abstract-only - #Equivalence Trial - #Evaluation Studies - #Expression of Concern - #Festschrift - #Government Document - #Guideline + # English Abstract # doesn't indicate that this is abstract-only + # Equivalence Trial + # Evaluation Studies + # Expression of Concern + # Festschrift + # Government Document + # Guideline "Historical Article": "article-journal", - #Interactive Tutorial + # Interactive Tutorial "Interview": "interview", "Introductory Journal Article": "article-journal", "Journal Article": "article-journal", @@ -56,53 +55,65 @@ PUBMED_RELEASE_TYPE_MAP = { "Legal Case": "legal_case", "Legislation": "legislation", "Letter": "letter", - #Meta-Analysis - #Multicenter Study - #News + # Meta-Analysis + # Multicenter Study + # News "Newspaper Article": "article-newspaper", - #Observational Study - #Observational Study, Veterinary - #Overall - #Patient Education Handout - #Periodical Index - #Personal Narrative - #Portrait - #Practice Guideline - #Pragmatic Clinical Trial - #Publication Components - #Publication Formats - #Publication Type Category - #Randomized Controlled Trial - #Research Support, American Recovery and Reinvestment Act - #Research Support, N.I.H., Extramural - #Research Support, N.I.H., Intramural - #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. - #Research Support, U.S. Gov't, P.H.S. - #Review # in the "literature review" sense, not "product review" - #Scientific Integrity Review - #Study Characteristics - #Support of Research - #Systematic Review + # Observational Study + # Observational Study, Veterinary + # Overall + # Patient Education Handout + # Periodical Index + # Personal Narrative + # Portrait + # Practice Guideline + # Pragmatic Clinical Trial + # Publication Components + # Publication Formats + # Publication Type Category + # Randomized Controlled Trial + # Research Support, American Recovery and Reinvestment Act + # Research Support, N.I.H., Extramural + # Research Support, N.I.H., Intramural + # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. + # Research Support, U.S. Gov't, P.H.S. 
+ # Review # in the "literature review" sense, not "product review" + # Scientific Integrity Review + # Study Characteristics + # Support of Research + # Systematic Review "Technical Report": "report", - #Twin Study - #Validation Studies - #Video-Audio Media - #Webcasts + # Twin Study + # Validation Studies + # Video-Audio Media + # Webcasts } MONTH_ABBR_MAP = { - "Jan": 1, "01": 1, - "Feb": 2, "02": 2, - "Mar": 3, "03": 3, - "Apr": 4, "04": 4, - "May": 5, "05": 5, - "Jun": 6, "06": 6, - "Jul": 7, "07": 7, - "Aug": 8, "08": 8, - "Sep": 9, "09": 9, - "Oct": 10, "10": 10, - "Nov": 11, "11": 11, - "Dec": 12, "12": 12, + "Jan": 1, + "01": 1, + "Feb": 2, + "02": 2, + "Mar": 3, + "03": 3, + "Apr": 4, + "04": 4, + "May": 5, + "05": 5, + "Jun": 6, + "06": 6, + "Jul": 7, + "07": 7, + "Aug": 8, + "08": 8, + "Sep": 9, + "09": 9, + "Oct": 10, + "10": 10, + "Nov": 11, + "11": 11, + "Dec": 12, + "12": 12, } # From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ @@ -295,11 +306,10 @@ COUNTRY_NAME_MAP = { "United Kingdom": "gb", "United States": "us", "Uruguay": "uy", - # Additions from running over large files "Bosnia and Herzegovina": "ba", - #"International" - "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn + # "International" + "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn "Russia (Federation)": "ru", "Scotland": "gb", "England": "gb", @@ -320,18 +330,21 @@ class PubmedImporter(EntityImporter): def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of PubMed/MEDLINE XML metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter') - super().__init__(api, + eg_desc = kwargs.get( + "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata" + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.PubmedImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) self.lookup_refs = lookup_refs - self.create_containers = kwargs.get('create_containers', True) + self.create_containers = kwargs.get("create_containers", True) self.read_issn_map_file(issn_map_file) def want(self, obj): @@ -365,15 +378,15 @@ class PubmedImporter(EntityImporter): release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string] break if pub_types: - extra_pubmed['pub_types'] = pub_types + extra_pubmed["pub_types"] = pub_types if medline.Article.PublicationTypeList.find(string="Retraction of Publication"): release_type = "retraction" retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf") if retraction_of: if retraction_of.RefSource: - extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string + extra_pubmed["retraction_of_raw"] = retraction_of.RefSource.string if retraction_of.PMID: - extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string + extra_pubmed["retraction_of_pmid"] = retraction_of.PMID.string # everything in medline is published release_stage = "published" @@ -388,18 +401,18 @@ class PubmedImporter(EntityImporter): elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"): withdrawn_status = "concern" - pages = medline.find('MedlinePgn') + pages = medline.find("MedlinePgn") if pages: pages = pages.string - title = medline.Article.ArticleTitle.get_text() # always present + title = medline.Article.ArticleTitle.get_text() # always 
present if title: - title = title.replace('\n', ' ') - if title.endswith('.'): + title = title.replace("\n", " ") + if title.endswith("."): title = title[:-1] # this hides some "special" titles, but the vast majority are # translations; translations don't always include the original_title - if title.startswith('[') and title.endswith(']'): + if title.startswith("[") and title.endswith("]"): title = title[1:-1] else: # will filter out later @@ -408,8 +421,8 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: original_title = original_title.get_text() or None - original_title = original_title.replace('\n', ' ') - if original_title and original_title.endswith('.'): + original_title = original_title.replace("\n", " ") + if original_title and original_title.endswith("."): original_title = original_title[:-1] if original_title and not title: @@ -428,7 +441,9 @@ class PubmedImporter(EntityImporter): else: language = LANG_MAP_MARC.get(language) if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC): - warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) + warnings.warn( + "MISSING MARC LANG: {}".format(medline.Article.Language.string) + ) ### Journal/Issue Metadata # MedlineJournalInfo is always present @@ -441,9 +456,9 @@ class PubmedImporter(EntityImporter): country_name = mji.Country.string.strip() country_code = COUNTRY_NAME_MAP.get(country_name) if country_code: - container_extra['country'] = country_code + container_extra["country"] = country_code elif country_name: - container_extra['country_name'] = country_name + container_extra["country_name"] = country_name if mji.find("ISSNLinking"): issnl = mji.ISSNLinking.string @@ -462,7 +477,7 @@ class PubmedImporter(EntityImporter): if issnl: container_id = self.lookup_issnl(issnl) - pub_date = medline.Article.find('ArticleDate') + pub_date = medline.Article.find("ArticleDate") if not pub_date: pub_date = journal.PubDate if not pub_date: @@ -476,7 +491,8 @@ class PubmedImporter(EntityImporter): release_date = datetime.date( release_year, MONTH_ABBR_MAP[pub_date.Month.string], - int(pub_date.Day.string)) + int(pub_date.Day.string), + ) release_date = release_date.isoformat() except ValueError as ve: print("bad date, skipping: {}".format(ve), file=sys.stderr) @@ -486,25 +502,35 @@ class PubmedImporter(EntityImporter): if len(medline_date) >= 4 and medline_date[:4].isdigit(): release_year = int(medline_date[:4]) if release_year < 1300 or release_year > 2040: - print("bad medline year, skipping: {}".format(release_year), file=sys.stderr) + print( + "bad medline year, skipping: {}".format(release_year), file=sys.stderr + ) release_year = None else: - print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) + print( + "unparsable medline date, skipping: {}".format(medline_date), + file=sys.stderr, + ) if journal.find("Title"): container_name = journal.Title.get_text() - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): # name, type, publisher, issnl # extra: original_name, languages, country ce = fatcat_openapi_client.ContainerEntity( name=container_name, - container_type='journal', - #NOTE: publisher not included + container_type="journal", + # NOTE: publisher not included issnl=issnl, issnp=issnp, - extra=(container_extra or None)) + 
extra=(container_extra or None), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -521,8 +547,10 @@ class PubmedImporter(EntityImporter): # "All abstracts are in English" abstracts = [] primary_abstract = medline.find("Abstract") - if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'): - joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")]) + if primary_abstract and primary_abstract.AbstractText.get("NlmCategory"): + joined = "\n".join( + [m.get_text() for m in primary_abstract.find_all("AbstractText")] + ) abst = fatcat_openapi_client.ReleaseAbstract( content=joined, mimetype="text/plain", @@ -539,7 +567,7 @@ class PubmedImporter(EntityImporter): ) if abst.content: abstracts.append(abst) - if abstract.find('math'): + if abstract.find("math"): abst = fatcat_openapi_client.ReleaseAbstract( # strip the <AbstractText> tags content=str(abstract)[14:-15], @@ -551,8 +579,8 @@ class PubmedImporter(EntityImporter): other_abstracts = medline.find_all("OtherAbstract") for other in other_abstracts: lang = "en" - if other.get('Language'): - lang = LANG_MAP_MARC.get(other['Language']) + if other.get("Language"): + lang = LANG_MAP_MARC.get(other["Language"]) abst = fatcat_openapi_client.ReleaseAbstract( content=other.AbstractText.get_text().strip(), mimetype="text/plain", @@ -572,15 +600,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.get_text().replace('\n', ' ') + given_name = author.ForeName.get_text().replace("\n", " ") if author.LastName: - surname = author.LastName.get_text().replace('\n', ' ') + surname = author.LastName.get_text().replace("\n", " ") if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): - raw_name = author.CollectiveName.get_text().replace('\n', ' ') + raw_name = author.CollectiveName.get_text().replace("\n", " ") contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -590,7 +618,7 @@ class PubmedImporter(EntityImporter): orcid = orcid.replace("http://orcid.org/", "") elif orcid.startswith("https://orcid.org/"): orcid = orcid.replace("https://orcid.org/", "") - elif '-' not in orcid: + elif "-" not in orcid: orcid = "{}-{}-{}-{}".format( orcid[0:4], orcid[4:8], @@ -598,27 +626,31 @@ class PubmedImporter(EntityImporter): orcid[12:16], ) creator_id = self.lookup_orcid(orcid) - contrib_extra['orcid'] = orcid + contrib_extra["orcid"] = orcid affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].get_text().replace('\n', ' ') + raw_affiliation = affiliations[0].get_text().replace("\n", " ") if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]] + contrib_extra["more_affiliations"] = [ + ra.get_text().replace("\n", " ") for ra in affiliations[1:] + ] if author.find("EqualContrib"): # TODO: schema for this? 
- contrib_extra['equal'] = True - contribs.append(fatcat_openapi_client.ReleaseContrib( - raw_name=raw_name, - given_name=given_name, - surname=surname, - role="author", - raw_affiliation=raw_affiliation, - creator_id=creator_id, - extra=contrib_extra, - )) - - if medline.AuthorList['CompleteYN'] == 'N': + contrib_extra["equal"] = True + contribs.append( + fatcat_openapi_client.ReleaseContrib( + raw_name=raw_name, + given_name=given_name, + surname=surname, + role="author", + raw_affiliation=raw_affiliation, + creator_id=creator_id, + extra=contrib_extra, + ) + ) + + if medline.AuthorList["CompleteYN"] == "N": contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al.")) for i, contrib in enumerate(contribs): @@ -633,7 +665,7 @@ class PubmedImporter(EntityImporter): # note that Reference always exists within a ReferenceList, but # that there may be multiple ReferenceList (eg, sometimes one per # Reference) - for ref in pubmed.find_all('Reference'): + for ref in pubmed.find_all("Reference"): ref_extra = dict() ref_doi = ref.find("ArticleId", IdType="doi") if ref_doi: @@ -643,22 +675,24 @@ class PubmedImporter(EntityImporter): ref_pmid = clean_pmid(ref_pmid.string) ref_release_id = None if ref_doi: - ref_extra['doi'] = ref_doi + ref_extra["doi"] = ref_doi if self.lookup_refs: ref_release_id = self.lookup_doi(ref_doi) if ref_pmid: - ref_extra['pmid'] = ref_pmid + ref_extra["pmid"] = ref_pmid if self.lookup_refs: ref_release_id = self.lookup_pmid(ref_pmid) ref_raw = ref.Citation if ref_raw: - ref_extra['unstructured'] = ref_raw.get_text() + ref_extra["unstructured"] = ref_raw.get_text() if not ref_extra: ref_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - target_release_id=ref_release_id, - extra=ref_extra, - )) + refs.append( + fatcat_openapi_client.ReleaseRef( + target_release_id=ref_release_id, + extra=ref_extra, + ) + ) if not refs: refs = None @@ -669,7 +703,7 @@ class PubmedImporter(EntityImporter): # group-title # pubmed: retraction refs if extra_pubmed: - extra['pubmed'] = extra_pubmed + extra["pubmed"] = extra_pubmed if not extra: extra = None @@ -690,14 +724,14 @@ class PubmedImporter(EntityImporter): doi=doi, pmid=pmid, pmcid=pmcid, - #isbn13 # never in Article + # isbn13 # never in Article ), volume=volume, issue=issue, pages=pages, - #publisher # not included? + # publisher # not included? language=language, - #license_slug # not in MEDLINE + # license_slug # not in MEDLINE abstracts=abstracts, contribs=contribs, refs=refs, @@ -725,21 +759,22 @@ class PubmedImporter(EntityImporter): raise err if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid: warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format( - existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid) + existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid + ) warnings.warn(warn_str) - self.counts['warn-pmid-doi-mismatch'] += 1 + self.counts["warn-pmid-doi-mismatch"] += 1 # don't clobber DOI, but do group together re.ext_ids.doi = None re.work_id = existing.work_id if existing and not self.do_updates: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): # TODO: any other reasons to do an update? 
# don't update if it already has PMID - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False elif existing: # but do update if only DOI was set @@ -750,12 +785,12 @@ class PubmedImporter(EntityImporter): existing.container_id = existing.container_id or re.container_id existing.refs = existing.refs or re.refs existing.abstracts = existing.abstracts or re.abstracts - existing.extra['pubmed'] = re.extra['pubmed'] + existing.extra["pubmed"] = re.extra["pubmed"] # fix stub titles if existing.title in [ - "OUP accepted manuscript", - ]: + "OUP accepted manuscript", + ]: existing.title = re.title existing.original_title = existing.original_title or re.original_title @@ -770,8 +805,8 @@ class PubmedImporter(EntityImporter): existing.language = existing.language or re.language # update subtitle in-place first - if not existing.subtitle and existing.extra.get('subtitle'): - subtitle = existing.extra.pop('subtitle') + if not existing.subtitle and existing.extra.get("subtitle"): + subtitle = existing.extra.pop("subtitle") if type(subtitle) == list: subtitle = subtitle[0] if subtitle: @@ -781,13 +816,13 @@ class PubmedImporter(EntityImporter): try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -797,11 +832,14 @@ class PubmedImporter(EntityImporter): return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): @@ -812,8 +850,9 @@ class PubmedImporter(EntityImporter): for article in soup.find_all("PubmedArticle"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) + -if __name__=='__main__': +if __name__ == "__main__": parser = PubmedImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 77205cee..78eeec7a 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid @@ -30,25 +29,25 @@ class ShadowLibraryImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Import of 'Shadow Library' file/release matches" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = 
eg_extra.get("agent", "fatcat_tools.ShadowLibraryImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") def want(self, raw_record): """ Only want to import records with complete file-level metadata """ - fm = raw_record['file_meta'] - if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']): - self.counts['skip-file-meta-incomplete'] += 1 + fm = raw_record["file_meta"] + if not (fm["mimetype"] and fm["md5hex"] and fm["sha256hex"] and fm["size_bytes"]): + self.counts["skip-file-meta-incomplete"] += 1 return False - if fm['mimetype'] != 'application/pdf': - self.counts['skip-not-pdf'] += 1 + if fm["mimetype"] != "application/pdf": + self.counts["skip-not-pdf"] += 1 return False return True @@ -57,23 +56,23 @@ class ShadowLibraryImporter(EntityImporter): We do the release lookup in this method. Try DOI, then PMID, last ISBN13. """ - shadow_corpus = obj['shadow']['shadow_corpus'] + shadow_corpus = obj["shadow"]["shadow_corpus"] assert shadow_corpus == shadow_corpus.strip().lower() - doi = clean_doi(obj['shadow'].get('doi')) - pmid = clean_pmid(obj['shadow'].get('pmid')) - isbn13 = clean_isbn13(obj['shadow'].get('isbn13')) - shadow_id = obj['shadow'].get('shadow_id').strip() + doi = clean_doi(obj["shadow"].get("doi")) + pmid = clean_pmid(obj["shadow"].get("pmid")) + isbn13 = clean_isbn13(obj["shadow"].get("isbn13")) + shadow_id = obj["shadow"].get("shadow_id").strip() assert shadow_id - extra = { '{}_id'.format(shadow_corpus): shadow_id } - for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + extra = {"{}_id".format(shadow_corpus): shadow_id} + for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]: if not ext_id: continue - extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id + extra["{}_{}".format(shadow_corpus, ext_type)] = ext_id # lookup release via several idents re = None - for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]: if not ext_id: continue try: @@ -86,29 +85,31 @@ class ShadowLibraryImporter(EntityImporter): break if not re: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None - release_ids = [re.ident,] + release_ids = [ + re.ident, + ] # parse single CDX into URLs (if exists) urls = [] - if obj.get('cdx'): - url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel) + if obj.get("cdx"): + url = make_rel_url(obj["cdx"]["url"], default_link_rel=self.default_link_rel) if url is not None: urls.append(url) wayback = "https://web.archive.org/web/{}/{}".format( - obj['cdx']['datetime'], - obj['cdx']['url']) + obj["cdx"]["datetime"], obj["cdx"]["url"] + ) urls.append(("webarchive", wayback)) urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] fe = fatcat_openapi_client.FileEntity( - md5=obj['file_meta']['md5hex'], - sha1=obj['file_meta']['sha1hex'], - sha256=obj['file_meta']['sha256hex'], - size=int(obj['file_meta']['size_bytes']), - mimetype=obj['file_meta']['mimetype'] or None, + md5=obj["file_meta"]["md5hex"], + sha1=obj["file_meta"]["sha1hex"], + sha256=obj["file_meta"]["sha256hex"], + size=int(obj["file_meta"]["size_bytes"]), + mimetype=obj["file_meta"]["mimetype"] or None, release_ids=release_ids, urls=urls, extra=dict(shadows=extra), @@ -130,45 +131,50 @@ class 
ShadowLibraryImporter(EntityImporter): if not existing.extra: existing.extra = {} - if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']: + if ( + existing.extra.get("shadows") + and list(fe.extra["shadows"].keys())[0] in existing.extra["shadows"] + ): # already imported from this shadow library; skip - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # check for edit conflicts if existing.ident in [e.ident for e in self._edits_inflight]: - self.counts['skip-update-inflight'] += 1 + self.counts["skip-update-inflight"] += 1 return False if fe.sha1 in [e.sha1 for e in self._edits_inflight]: raise Exception("Inflight insert; shouldn't happen") # minimum viable "existing" URL cleanup to fix dupes and broken links: # remove 'None' wayback URLs, and set archive.org rel 'archive' - existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + existing.urls = [ + u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url) + ] for i in range(len(existing.urls)): u = existing.urls[i] - if u.rel == 'repository' and '://archive.org/download/' in u.url: - existing.urls[i].rel = 'archive' - if u.rel == 'social': - u.rel = 'academicsocial' + if u.rel == "repository" and "://archive.org/download/" in u.url: + existing.urls[i].rel = "archive" + if u.rel == "social": + u.rel = "academicsocial" # merge the existing into this one and update merged_urls = {} for u in fe.urls + existing.urls: merged_urls[u.url] = u existing.urls = list(merged_urls.values()) - if not existing.extra.get('shadows'): - existing.extra['shadows'] = fe.extra['shadows'] + if not existing.extra.get("shadows"): + existing.extra["shadows"] = fe.extra["shadows"] else: - existing.extra['shadows'].update(fe.extra['shadows']) + existing.extra["shadows"].update(fe.extra["shadows"]) # do these "plus ones" because we really want to do these updates when possible if len(existing.urls) > SANE_MAX_URLS + 1: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) if len(existing.release_ids) > SANE_MAX_RELEASES + 1: - self.counts['skip-update-too-many-releases'] += 1 + self.counts["skip-update-too-many-releases"] += 1 return None existing.mimetype = existing.mimetype or fe.mimetype existing.size = existing.size or fe.size @@ -180,12 +186,15 @@ class ShadowLibraryImporter(EntityImporter): # group-level de-dupe edit.sha1 = existing.sha1 self._edits_inflight.append(edit) - self.counts['update'] += 1 + self.counts["update"] += 1 return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index 196f86ff..22fefad3 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -33,22 +33,23 @@ REQ_SESSION = requests.Session() def parse_wbm_url(url): """Takes a wayback machine URL, and returns a tuple: - (timestamp, datetime, original_url) + (timestamp, 
datetime, original_url) """ - chunks = url.split('/') + chunks = url.split("/") assert len(chunks) >= 6 - assert chunks[2] == 'web.archive.org' - assert chunks[3] == 'web' - return (chunks[4], - parse_wbm_timestamp(chunks[4]), - '/'.join(chunks[5:])) + assert chunks[2] == "web.archive.org" + assert chunks[3] == "web" + return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) + def test_parse_wbm_url(): u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" assert parse_wbm_url(u) == ( "20010712114837", datetime.datetime(2001, 7, 12, 11, 48, 37), - "http://www.dlib.org/dlib/june01/reich/06reich.html") + "http://www.dlib.org/dlib/june01/reich/06reich.html", + ) + def parse_wbm_timestamp(timestamp): """ @@ -56,7 +57,7 @@ def parse_wbm_timestamp(timestamp): python datetime object (UTC) """ # strip any "im_" or "id_" suffix - if timestamp.endswith('_'): + if timestamp.endswith("_"): timestamp = timestamp[:-3] # inflexible; require the full second-precision timestamp assert len(timestamp) == 14 @@ -66,11 +67,13 @@ def parse_wbm_timestamp(timestamp): day=int(timestamp[6:8]), hour=int(timestamp[8:10]), minute=int(timestamp[10:12]), - second=int(timestamp[12:14])) + second=int(timestamp[12:14]), + ) + def test_parse_wbm_timestamp(): - assert parse_wbm_timestamp("20010712114837") == \ - datetime.datetime(2001, 7, 12, 11, 48, 37) + assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) + def fetch_wbm(url): resp = REQ_SESSION.get(url) @@ -78,31 +81,35 @@ def fetch_wbm(url): assert resp.content return resp.content + def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): sys.stderr.write(embed_url + "\n") - assert embed_url.startswith('/web/') - embed_url = embed_url.split('/') + assert embed_url.startswith("/web/") + embed_url = embed_url.split("/") timestamp = embed_url[2] - if timestamp.endswith('_'): + if timestamp.endswith("_"): timestamp = timestamp[:-3] - url = '/'.join(embed_url[3:]) - #print((timestamp, url)) - resp = REQ_SESSION.get(CDX_API_BASE, params=dict( - url=url, - closest=timestamp, - sort="closest", - resolveRevisits="true", - matchType="exact", - limit=1, - )) + url = "/".join(embed_url[3:]) + # print((timestamp, url)) + resp = REQ_SESSION.get( + CDX_API_BASE, + params=dict( + url=url, + closest=timestamp, + sort="closest", + resolveRevisits="true", + matchType="exact", + limit=1, + ), + ) resp.raise_for_status() - #print(resp.url) + # print(resp.url) if resp.content: - hit = resp.content.decode('utf-8').split('\n')[0] + hit = resp.content.decode("utf-8").split("\n")[0] if cdx_output: cdx_output.write(hit + "\n") - cdx = hit.split(' ') - cdx = [x if (x and x != '-') else None for x in cdx] + cdx = hit.split(" ") + cdx = [x if (x and x != "-") else None for x in cdx] webcapture_cdx = WebcaptureCdxLine( surt=cdx[0], timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z", @@ -113,9 +120,9 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): sha256=None, ) if verify_hashes: - resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format( - cdx[1], # raw timestamp - webcapture_cdx.url)) + resp = REQ_SESSION.get( + GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp + ) resp.raise_for_status() assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() @@ -124,47 +131,50 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): else: return None + def 
wayback_url_to_relative(url): """ Wayback URLs can be relative or absolute in rewritten documents. This function converts any form of rewritten URL to a relative (to web.archive.org) one, or returns None if it isn't a rewritten URL at all. """ - if url.startswith('https://web.archive.org/'): + if url.startswith("https://web.archive.org/"): url = url[23:] - elif url.startswith('http://web.archive.org/'): + elif url.startswith("http://web.archive.org/"): url = url[22:] - if url.startswith('/web/'): + if url.startswith("/web/"): return url else: return None + def extract_embeds(soup): embeds = set() # <link href=""> - for tag in soup.find_all('link', href=True): - if tag['rel'] not in ('stylesheet',): + for tag in soup.find_all("link", href=True): + if tag["rel"] not in ("stylesheet",): continue - url = wayback_url_to_relative(tag['href']) + url = wayback_url_to_relative(tag["href"]) if url: embeds.add(url) # <img src=""> - for tag in soup.find_all('img', src=True): - url = wayback_url_to_relative(tag['src']) + for tag in soup.find_all("img", src=True): + url = wayback_url_to_relative(tag["src"]) if url: embeds.add(url) # <script src=""> - for tag in soup.find_all('script', src=True): - url = wayback_url_to_relative(tag['src']) + for tag in soup.find_all("script", src=True): + url = wayback_url_to_relative(tag["src"]) if url: embeds.add(url) return list(embeds) + def static_wayback_webcapture(wayback_url, cdx_output=None): """ Given a complete wayback machine capture URL, like: @@ -177,36 +187,40 @@ def static_wayback_webcapture(wayback_url, cdx_output=None): wbm_html = fetch_wbm(wayback_url) raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) - #with open(rewritten_path, 'r') as fp: + # with open(rewritten_path, 'r') as fp: # soup = BeautifulSoup(fp, "lxml") soup = BeautifulSoup(wbm_html, "lxml") embeds = extract_embeds(soup) - cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url), - cdx_output=cdx_output) + cdx_obj = lookup_cdx( + "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output + ) cdx_list = [cdx_obj] for url in embeds: cdx_obj = lookup_cdx(url, cdx_output=cdx_output) cdx_list.append(cdx_obj) - archive_urls = [WebcaptureUrl( - rel="wayback", - url="https://web.archive.org/web/", - )] + archive_urls = [ + WebcaptureUrl( + rel="wayback", + url="https://web.archive.org/web/", + ) + ] wc = WebcaptureEntity( cdx=cdx_list, timestamp=timestamp.isoformat() + "Z", original_url=original_url, archive_urls=archive_urls, - release_ids=None) + release_ids=None, + ) return wc + def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): """ Returns a tuple: (editgroup_id, edit). 
If failed, both are None """ raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) - git_rev = subprocess.check_output( - ["git", "describe", "--always"]).strip().decode('utf-8') + git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") release = api.get_release(release_id, expand="webcaptures") @@ -214,37 +228,44 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): for wc in release.webcaptures: if wc.original_url == original_url and wc.timestamp.date() == timestamp.date(): # skipping: already existed - print("release {} already had webcapture {} {}".format( - release_id, raw_timestamp, original_url)) + print( + "release {} already had webcapture {} {}".format( + release_id, raw_timestamp, original_url + ) + ) return (None, None) wc = static_wayback_webcapture(wayback_url) assert len(wc.cdx) >= 1 wc.release_ids = [release_id] if not editgroup_id: - eg = api.create_editgroup(Editgroup( - description="One-off import of static web content from wayback machine", - extra=dict( - git_rev=git_rev, - agent="fatcat_tools.auto_wayback_static"))) + eg = api.create_editgroup( + Editgroup( + description="One-off import of static web content from wayback machine", + extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"), + ) + ) editgroup_id = eg.editgroup_id edit = api.create_webcapture(eg.editgroup_id, wc) return (editgroup_id, edit) + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--verbose', - action='store_true', - help="verbose output") - parser.add_argument('wayback_url', - type=str, - help="URL of wayback capture to extract from") - parser.add_argument('--json-output', - type=argparse.FileType('w'), default=sys.stdout, - help="where to write out webcapture entity (as JSON)") - parser.add_argument('--cdx-output', - type=argparse.FileType('w'), default=None, - help="(optional) file to write out CDX stub") + parser.add_argument("--verbose", action="store_true", help="verbose output") + parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from") + parser.add_argument( + "--json-output", + type=argparse.FileType("w"), + default=sys.stdout, + help="where to write out webcapture entity (as JSON)", + ) + parser.add_argument( + "--cdx-output", + type=argparse.FileType("w"), + default=None, + help="(optional) file to write out CDX stub", + ) args = parser.parse_args() @@ -254,5 +275,6 @@ def main(): wc_dict = api_client.sanitize_for_serialization(wc) print(json.dumps(wc_dict)) -if __name__ == '__main__': + +if __name__ == "__main__": main() |