Diffstat (limited to 'chocula/database.py')
-rw-r--r-- | chocula/database.py | 604
1 file changed, 356 insertions, 248 deletions
diff --git a/chocula/database.py b/chocula/database.py
index f620515..11632b9 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -1,4 +1,3 @@
-
 from __future__ import annotations
 
 import sys
@@ -47,41 +46,49 @@ class HomepageUrl:
         """
         Returns None if url is really bad (not a URL).
         """
-        if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'):
+        if (
+            not url
+            or "mailto:" in url.lower()
+            or url.lower() in ("http://n/a", "http://na/", "http://na")
+        ):
             return None
-        if url.startswith('www.'):
+        if url.startswith("www."):
             url = "http://" + url
-        if url.startswith('ttp://') or url.startswith('ttps://'):
+        if url.startswith("ttp://") or url.startswith("ttps://"):
             url = "h" + url
-        url.replace('Http://', 'http://')
+        url.replace("Http://", "http://")
 
         url = str(urlcanon.semantic_precise(url))
-        if url == 'http://na/':
+        if url == "http://na/":
             # sort of redundant with above, but some only match after canonicalization
             return None
         url_surt = surt.surt(url)
         tld = tldextract.extract(url)
-        host = '.'.join(tld)
-        if host.startswith('.'):
+        host = ".".join(tld)
+        if host.startswith("."):
             host = host[1:]
-        return HomepageUrl(url=url,
-                           surt=url_surt,
-                           host=host,
-                           domain=tld.registered_domain,
-                           suffix=tld.suffix)
+        return HomepageUrl(
+            url=url,
+            surt=url_surt,
+            host=host,
+            domain=tld.registered_domain,
+            suffix=tld.suffix,
+        )
+
 
 def test_from_url():
-
-    assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == 'core.ac.uk'
-    assert HomepageUrl.from_url("http://thing.core.ac.uk").host == 'thing.core.ac.uk'
-    assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix== 'ac.uk'
-    assert HomepageUrl.from_url("google.com").suffix == 'com'
-    assert HomepageUrl.from_url("google.com").host == 'google.com'
+    assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == "core.ac.uk"
+    assert HomepageUrl.from_url("http://thing.core.ac.uk").host == "thing.core.ac.uk"
+    assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix == "ac.uk"
+
+    assert HomepageUrl.from_url("google.com").suffix == "com"
+    assert HomepageUrl.from_url("google.com").host == "google.com"
 
     assert HomepageUrl.from_url("mailto:bnewbold@bogus.com") == None
-    assert HomepageUrl.from_url("thing.com").url == 'http://thing.com/'
-    assert HomepageUrl.from_url("Http://thing.com///").url == 'http://thing.com/'
+    assert HomepageUrl.from_url("thing.com").url == "http://thing.com/"
+    assert HomepageUrl.from_url("Http://thing.com///").url == "http://thing.com/"
+
 
 @dataclass
 class UrlCrawlStatus:
@@ -95,6 +102,7 @@ class UrlCrawlStatus:
     gwb_url_success_dt: Optional[str]
     gwb_terminal_url_success_dt: Optional[str]
 
+
 @dataclass
 class DirectoryInfo:
     directory_slug: str
@@ -127,10 +135,19 @@ class DirectoryInfo:
         """
         if not self.issnl:
             raise ValueError
-        extra_dict = self.extra
-
-        for k in ('issne', 'issnp', 'name', 'publisher', 'abbrev', 'platform',
-                  'country', 'langs', 'original_name'):
+        extra_dict = self.extra
+
+        for k in (
+            "issne",
+            "issnp",
+            "name",
+            "publisher",
+            "abbrev",
+            "platform",
+            "country",
+            "langs",
+            "original_name",
+        ):
             if self.__dict__[k]:
                 extra_dict[k] = self.__dict__[k]
 
@@ -151,7 +168,7 @@ class DirectoryInfo:
         raise NotImplementedError()
 
 
-class IssnDatabase():
+class IssnDatabase:
     """
     Holds complete ISSN/ISSN-L table and helps with lookups and munging of
     raw ISSN strings
@@ -163,7 +180,7 @@ class IssnDatabase():
 
     def read_issn_map_file(self, issn_map_path: str):
         print("##### Loading ISSN-L map file...", file=sys.stderr)
-        with open(issn_map_path, 'r') as issn_map_file:
+        with open(issn_map_path, "r") as issn_map_file:
             for line in issn_map_file:
                 if line.startswith("ISSN") or len(line) == 0:
                     continue
@@ -209,7 +226,7 @@ class IssnDatabase():
         return info
 
 
-class ChoculaDatabase():
+class ChoculaDatabase:
     """
     Wraps a sqlite3 database
     """
@@ -218,7 +235,7 @@ class ChoculaDatabase():
         """
         To create a temporary database, pass ":memory:" as db_file
         """
-        self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+        self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
         self.data = dict()
         self.issn_db = issn_db
 
@@ -247,8 +264,7 @@ class ChoculaDatabase():
 
         cur = self.db.cursor()
         try:
-            cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)",
-                info.to_db_tuple())
+            cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)", info.to_db_tuple())
         except sqlite3.IntegrityError as ie:
             if str(ie).startswith("UNIQUE"):
                 return "duplicate"
@@ -264,7 +280,8 @@ class ChoculaDatabase():
         try:
             cur.execute(
                 "INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)",
-                homepage.to_db_tuple(issnl))
+                homepage.to_db_tuple(issnl),
+            )
         except sqlite3.IntegrityError as ie:
             if str(ie).startswith("UNIQUE"):
                 return "duplicate"
@@ -276,29 +293,33 @@ class ChoculaDatabase():
         print("##### Loading IA Homepage Crawl Results...")
         counts: Counter = Counter()
         cur = self.db.cursor()
-        for line in open(config.homepage_status.filepath, 'r'):
+        for line in open(config.homepage_status.filepath, "r"):
             if not line.strip():
                 continue
             row = json.loads(line)
-            counts['total'] += 1
-            url = row['url']
-            assert(url)
-            if row.get('gwb_url_success_dt') == 'error':
-                row['gwb_url_success_dt'] = None
-            if row.get('gwb_terminal_url_success_dt') == 'error':
-                row['gwb_terminal_url_success_dt'] = None
-            cur.execute("UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
-                (row['status_code'],
-                 row.get('crawl_error'),
-                 row.get('terminal_url'),
-                 row.get('terminal_status_code'),
-                 row.get('platform_software'),
-                 row.get('issnl_in_body'),
-                 row.get('blocked'),
-                 row.get('gwb_url_success_dt'),
-                 row.get('gwb_terminal_url_success_dt'),
-                 url))
-            counts['updated'] += 1
+            counts["total"] += 1
+            url = row["url"]
+            assert url
+            if row.get("gwb_url_success_dt") == "error":
+                row["gwb_url_success_dt"] = None
+            if row.get("gwb_terminal_url_success_dt") == "error":
+                row["gwb_terminal_url_success_dt"] = None
+            cur.execute(
+                "UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
+                (
+                    row["status_code"],
+                    row.get("crawl_error"),
+                    row.get("terminal_url"),
+                    row.get("terminal_status_code"),
+                    row.get("platform_software"),
+                    row.get("issnl_in_body"),
+                    row.get("blocked"),
+                    row.get("gwb_url_success_dt"),
+                    row.get("gwb_terminal_url_success_dt"),
+                    url,
+                ),
+            )
+            counts["updated"] += 1
         cur.close()
         self.db.commit()
         return counts
@@ -306,51 +327,54 @@ class ChoculaDatabase():
     def load_fatcat_containers(self, config: ChoculaConfig) -> Counter:
         print("##### Loading Fatcat Container Entities...")
        # JSON
-        json_file = open(config.fatcat_containers.filepath, 'r')
+        json_file = open(config.fatcat_containers.filepath, "r")
         counts: Counter = Counter()
         cur = self.db.cursor()
         for line in json_file:
             if not line:
                 continue
             row = json.loads(line)
-            if row['state'] != 'active':
+            if row["state"] != "active":
                 continue
-            counts['total'] += 1
-            extra = row.get('extra', dict())
-            issne = extra.get('issne')
-            issnp = extra.get('issnp')
-            country = extra.get('country')
-            languages = extra.get('languages', [])
+            counts["total"] += 1
+            extra = row.get("extra", dict())
+            issne = extra.get("issne")
+            issnp = extra.get("issnp")
+            country = extra.get("country")
+            languages = extra.get("languages", [])
             lang = None
             if languages:
                 lang = languages[0]
             try:
-                cur.execute("INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
-                    (row.get('issnl'),
-                     row['ident'],
-                     row['revision'],
-                     issne,
-                     issnp,
-                     row.get('wikidata_qid'),
-                     row['name'],
-                     row.get('container_type'),
-                     extra.get('publisher'),
-                     country,
-                     lang,
-                    ))
+                cur.execute(
+                    "INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
+                    (
+                        row.get("issnl"),
+                        row["ident"],
+                        row["revision"],
+                        issne,
+                        issnp,
+                        row.get("wikidata_qid"),
+                        row["name"],
+                        row.get("container_type"),
+                        extra.get("publisher"),
+                        country,
+                        lang,
+                    ),
+                )
             except sqlite3.IntegrityError as ie:
                 if str(ie).startswith("UNIQUE"):
                     counts["existing"] += 1
                     continue
                 else:
                     raise ie
-            counts['inserted'] += 1
-            if row.get('issnl'):
-                urls = extra.get('urls', [])
+            counts["inserted"] += 1
+            if row.get("issnl"):
+                urls = extra.get("urls", [])
                 for url in urls:
                     homepage = HomepageUrl.from_url(url)
                     if homepage:
-                        self.insert_homepage(row.get('issnl'), homepage, cur)
+                        self.insert_homepage(row.get("issnl"), homepage, cur)
         cur.close()
         self.db.commit()
         return counts
@@ -358,22 +382,31 @@ class ChoculaDatabase():
     def load_fatcat_stats(self, config: ChoculaConfig) -> Counter:
         print("##### Loading Fatcat Container Stats...")
         # JSON
-        json_file = open(config.fatcat_stats.filepath, 'r')
+        json_file = open(config.fatcat_stats.filepath, "r")
         counts: Counter = Counter()
         cur = self.db.cursor()
         for line in json_file:
             if not line:
                 continue
             row = json.loads(line)
-            total = int(row['total'])
+            total = int(row["total"])
             ia_frac: Optional[float] = None
             preserved_frac: Optional[float] = None
             if total > 0:
-                ia_frac = float(row['in_web'])/total
-                preserved_frac = float(row['is_preserved'])/total
-            cur.execute("UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?",
-                (total, row['in_web'], ia_frac, row['is_preserved'], preserved_frac, row['issnl']))
-            counts['updated'] += 1
+                ia_frac = float(row["in_web"]) / total
+                preserved_frac = float(row["is_preserved"]) / total
+            cur.execute(
+                "UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?",
+                (
+                    total,
+                    row["in_web"],
+                    ia_frac,
+                    row["is_preserved"],
+                    preserved_frac,
+                    row["issnl"],
+                ),
+            )
+            counts["updated"] += 1
         cur.close()
         self.db.commit()
         return counts
@@ -384,10 +417,10 @@ class ChoculaDatabase():
         self.db.row_factory = sqlite3.Row
         cur = self.db.execute("SELECT issnl, url FROM homepage;")
         for hrow in cur:
-            assert(hrow['url'])
-            assert(len(hrow['url'].split()) == 1)
-            counts['total'] += 1
-            print('\t'.join((hrow['issnl'], hrow['url'])))
+            assert hrow["url"]
+            assert len(hrow["url"].split()) == 1
+            counts["total"] += 1
+            print("\t".join((hrow["issnl"], hrow["url"])))
         return counts
 
     def summarize(self) -> Counter:
@@ -395,135 +428,189 @@ class ChoculaDatabase():
         counts: Counter = Counter()
         cur = self.db.cursor()
         self.db.row_factory = sqlite3.Row
-        index_issnls = list(cur.execute('SELECT DISTINCT issnl FROM directory'))
-        fatcat_issnls = list(cur.execute('SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null'))
+        index_issnls = list(cur.execute("SELECT DISTINCT issnl FROM directory"))
+        fatcat_issnls = list(
+            cur.execute(
+                "SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null"
+            )
+        )
         all_issnls = set([i[0] for i in index_issnls + fatcat_issnls])
         print("{} total ISSN-Ls".format(len(all_issnls)))
         for issnl in all_issnls:
-            #print(issnl)
-            counts['total'] += 1
+            # print(issnl)
+            counts["total"] += 1
 
             out = dict()
 
             # check if ISSN-L is good. this is here because of fatcat import
-            out['known_issnl'] = (self.issn_db.issn2issnl(issnl) == issnl)
-            if not out['known_issnl']:
-                counts['unknown-issnl'] += 1
-            out['valid_issnl'] = stdnum.issn.is_valid(issnl)
-            if not out['valid_issnl']:
-                counts['invalid-issnl'] += 1
-
-            fatcat_row = list(self.db.execute("SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl]))
+            out["known_issnl"] = self.issn_db.issn2issnl(issnl) == issnl
+            if not out["known_issnl"]:
+                counts["unknown-issnl"] += 1
+            out["valid_issnl"] = stdnum.issn.is_valid(issnl)
+            if not out["valid_issnl"]:
+                counts["invalid-issnl"] += 1
+
+            fatcat_row = list(
+                self.db.execute(
+                    "SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl]
+                )
+            )
             if fatcat_row:
                 frow = fatcat_row[0]
-                out['fatcat_ident'] = frow['ident']
-                for k in ('name', 'publisher', 'issne', 'issnp', 'wikidata_qid', 'lang', 'country', 'release_count', 'ia_count', 'ia_frac', 'kbart_count', 'kbart_frac', 'preserved_count', 'preserved_frac'):
+                out["fatcat_ident"] = frow["ident"]
+                for k in (
+                    "name",
+                    "publisher",
+                    "issne",
+                    "issnp",
+                    "wikidata_qid",
+                    "lang",
+                    "country",
+                    "release_count",
+                    "ia_count",
+                    "ia_frac",
+                    "kbart_count",
+                    "kbart_frac",
+                    "preserved_count",
+                    "preserved_frac",
+                ):
                     if not out.get(k) and frow[k] != None:
                         out[k] = frow[k]
 
             cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [issnl])
             for irow in cur:
-                if irow['slug'] in ('crossref',):
-                    out['has_dois'] = True
+                if irow["slug"] in ("crossref",):
+                    out["has_dois"] = True
                 # TODO: other DOI registrars (japan, datacite)
-                if irow['slug'] == 'wikidata':
-                    out['wikidata_qid'] = irow['identifier']
-                for k in ('name',):
+                if irow["slug"] == "wikidata":
+                    out["wikidata_qid"] = irow["identifier"]
+                for k in ("name",):
                     if not out.get(k) and irow[k]:
                         out[k] = irow[k]
-                if irow['extra']:
-                    extra = json.loads(irow['extra'])
-                    for k in ('country', 'lang', 'issne', 'issnp', 'publisher', 'platform'):
+                if irow["extra"]:
+                    extra = json.loads(irow["extra"])
+                    for k in (
+                        "country",
+                        "lang",
+                        "issne",
+                        "issnp",
+                        "publisher",
+                        "platform",
+                    ):
                         if not out.get(k) and extra.get(k):
                             out[k] = extra[k]
-                if irow['slug'] in ('doaj','road','szczepanski', 'gold_oa'):
-                    out['is_oa'] = True
-                if irow['slug'] == 'ezb':
-                    ezb_extra = json.loads(irow['extra'])
-                    if ezb_extra['ezb_color'] == 'green':
-                        out['is_oa'] = True
-                if irow['slug'] == 'sherpa_romeo':
-                    extra = json.loads(irow['extra'])
-                    out['sherpa_color'] = extra['sherpa_romeo']['color']
-                    if extra['sherpa_romeo']['color'] == 'green':
-                        out['is_oa'] = True
+                if irow["slug"] in ("doaj", "road", "szczepanski", "gold_oa"):
+                    out["is_oa"] = True
+                if irow["slug"] == "ezb":
+                    ezb_extra = json.loads(irow["extra"])
+                    if ezb_extra["ezb_color"] == "green":
+                        out["is_oa"] = True
+                if irow["slug"] == "sherpa_romeo":
+                    extra = json.loads(irow["extra"])
+                    out["sherpa_color"] = extra["sherpa_romeo"]["color"]
+                    if extra["sherpa_romeo"]["color"] == "green":
+                        out["is_oa"] = True
 
             # filter out "NA" ISSNs
-            for k in ('issne', 'issnp'):
-                if out.get(k) and (len(out[k]) != 9 or out[k][4] != '-'):
+            for k in ("issne", "issnp"):
+                if out.get(k) and (len(out[k]) != 9 or out[k][4] != "-"):
                     out.pop(k)
 
             cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [issnl])
             for hrow in cur:
-                out['any_homepage'] = True
-                if hrow['terminal_status_code'] == 200 and hrow['host'] != 'web.archive.org':
-                    out['any_live_homepage'] = True
-                if hrow['gwb_url_success_dt'] or hrow['gwb_terminal_url_success_dt']:
-                    out['any_gwb_homepage'] = True
-
-            if out.get('wikidata_qid'):
-                assert out['wikidata_qid'].startswith('Q')
-                assert out['wikidata_qid'][1].isdigit()
-                assert out['wikidata_qid'][-1].isdigit()
+                out["any_homepage"] = True
+                if (
+                    hrow["terminal_status_code"] == 200
+                    and hrow["host"] != "web.archive.org"
+                ):
+                    out["any_live_homepage"] = True
+                if hrow["gwb_url_success_dt"] or hrow["gwb_terminal_url_success_dt"]:
+                    out["any_gwb_homepage"] = True
+
+            if out.get("wikidata_qid"):
+                assert out["wikidata_qid"].startswith("Q")
+                assert out["wikidata_qid"][1].isdigit()
+                assert out["wikidata_qid"][-1].isdigit()
 
             # define publisher types
-            publisher = out.get('publisher')
-            pl = out.get('publisher', '').lower().strip()
-            if out.get('platform') == 'scielo':
-                out['publisher_type'] = 'scielo'
-            elif publisher in BIG5_PUBLISHERS or 'elsevier' in pl or 'springer' in pl or 'wiley' in pl:
-                out['publisher_type'] = 'big5'
+            publisher = out.get("publisher")
+            pl = out.get("publisher", "").lower().strip()
+            if out.get("platform") == "scielo":
+                out["publisher_type"] = "scielo"
+            elif (
+                publisher in BIG5_PUBLISHERS
+                or "elsevier" in pl
+                or "springer" in pl
+                or "wiley" in pl
+            ):
+                out["publisher_type"] = "big5"
             elif publisher in OA_PUBLISHERS:
-                out['publisher_type'] = 'oa'
-            elif publisher in COMMERCIAL_PUBLISHERS or 'wolters kluwer' in pl or 'wolters-kluwer' in pl:
-                out['publisher_type'] = 'commercial'
+                out["publisher_type"] = "oa"
+            elif (
+                publisher in COMMERCIAL_PUBLISHERS
+                or "wolters kluwer" in pl
+                or "wolters-kluwer" in pl
+            ):
+                out["publisher_type"] = "commercial"
             elif publisher in ARCHIVE_PUBLISHERS:
-                out['publisher_type'] = 'archive'
+                out["publisher_type"] = "archive"
             elif publisher in REPOSITORY_PUBLISHERS:
-                out['publisher_type'] = 'repository'
+                out["publisher_type"] = "repository"
             elif publisher in OTHER_PUBLISHERS:
-                out['publisher_type'] = 'other'
-            elif publisher in SOCIETY_PUBLISHERS or 'society' in pl or 'association' in pl or 'academy of ' in pl or 'institute of' in pl:
-                out['publisher_type'] = 'society'
-            elif publisher in UNI_PRESS_PUBLISHERS or 'university ' in pl:
-                out['publisher_type'] = 'unipress'
-            elif 'scielo' in pl:
-                out['publisher_type'] = 'scielo'
-            elif out.get('is_oa') and (not out.get('has_dois') or out.get('lang') not in (None, 'en', 'de', 'fr', 'ja') or out.get('country') not in (None, 'us', 'gb', 'nl', 'cn', 'jp', 'de')):
+                out["publisher_type"] = "other"
+            elif (
+                publisher in SOCIETY_PUBLISHERS
+                or "society" in pl
+                or "association" in pl
+                or "academy of " in pl
+                or "institute of" in pl
+            ):
+                out["publisher_type"] = "society"
+            elif publisher in UNI_PRESS_PUBLISHERS or "university " in pl:
+                out["publisher_type"] = "unipress"
+            elif "scielo" in pl:
+                out["publisher_type"] = "scielo"
+            elif out.get("is_oa") and (
+                not out.get("has_dois")
+                or out.get("lang") not in (None, "en", "de", "fr", "ja")
+                or out.get("country") not in (None, "us", "gb", "nl", "cn", "jp", "de")
+            ):
                 # current informal definition of longtail
-                out['publisher_type'] = 'longtail'
-                out['is_longtail'] = True
-
-            cur.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
-                (issnl,
-                 out.get('issne'),
-                 out.get('issnp'),
-                 out.get('wikidata_qid'),
-                 out.get('fatcat_ident'),
-                 out.get('name'),
-                 out.get('publisher'),
-                 out.get('country'),
-                 out.get('lang'),
-                 out.get('is_oa', False),
-                 out.get('sherpa_color'),
-                 out.get('is_longtail', False),
-                 out.get('is_active'),
-                 out.get('publisher_type'),
-                 out.get('has_dois', False),
-                 out.get('any_homepage', False),
-                 out.get('any_live_homepage', False),
-                 out.get('any_gwb_homepage', False),
-                 out.get('known_issnl'),
-                 out.get('valid_issnl'),
-
-                 out.get('release_count'),
-                 out.get('ia_count'),
-                 out.get('ia_frac'),
-                 out.get('kbart_count'),
-                 out.get('kbart_frac'),
-                 out.get('preserved_count'),
-                 out.get('preserved_frac'),
-                ))
+                out["publisher_type"] = "longtail"
+                out["is_longtail"] = True
+
+            cur.execute(
+                "INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
+                (
+                    issnl,
+                    out.get("issne"),
+                    out.get("issnp"),
+                    out.get("wikidata_qid"),
+                    out.get("fatcat_ident"),
+                    out.get("name"),
+                    out.get("publisher"),
+                    out.get("country"),
+                    out.get("lang"),
+                    out.get("is_oa", False),
+                    out.get("sherpa_color"),
+                    out.get("is_longtail", False),
+                    out.get("is_active"),
+                    out.get("publisher_type"),
+                    out.get("has_dois", False),
+                    out.get("any_homepage", False),
+                    out.get("any_live_homepage", False),
+                    out.get("any_gwb_homepage", False),
+                    out.get("known_issnl"),
+                    out.get("valid_issnl"),
+                    out.get("release_count"),
+                    out.get("ia_count"),
+                    out.get("ia_frac"),
+                    out.get("kbart_count"),
+                    out.get("kbart_frac"),
                    out.get("preserved_count"),
+                    out.get("preserved_frac"),
+                ),
+            )
         cur.close()
         self.db.commit()
         return counts
@@ -534,125 +621,146 @@ class ChoculaDatabase():
             for idx, col in enumerate(cursor.description):
                 d[col[0]] = row[idx]
             return d
+
         counts: Counter = Counter()
         self.db.row_factory = dict_factory
         cur = self.db.cursor()
-        for row in cur.execute('SELECT * FROM journal'):
+        for row in cur.execute("SELECT * FROM journal"):
             print(json.dumps(row))
-            counts['total'] += 1
+            counts["total"] += 1
         return counts
 
     def export_fatcat(self):
         counts: Counter = Counter()
         self.db.row_factory = sqlite3.Row
         cur = self.db.cursor()
-        for row in cur.execute('SELECT * FROM journal WHERE valid_issnl = 1'):
-            counts['total'] += 1
+        for row in cur.execute("SELECT * FROM journal WHERE valid_issnl = 1"):
+            counts["total"] += 1
 
-            name = row['name']
+            name = row["name"]
             if name:
                 name = name.strip()
 
-            if not row['name']:
-                counts['empty-name'] += 1
+            if not row["name"]:
+                counts["empty-name"] += 1
                 continue
 
             if len(name) <= 2:
-                counts['short-name'] += 1
+                counts["short-name"] += 1
                 continue
 
-            publisher = row['publisher']
+            publisher = row["publisher"]
             if publisher:
                 publisher = publisher.strip() or None
 
             out = dict(
-                issnl=row['issnl'],
-                wikidata_qid=row['wikidata_qid'],
-                ident=row['fatcat_ident'],
+                issnl=row["issnl"],
+                wikidata_qid=row["wikidata_qid"],
+                ident=row["fatcat_ident"],
                 publisher=publisher,
                 name=name,
-                _known_issnl=row['known_issnl'])
+                _known_issnl=row["known_issnl"],
+            )
 
             extra = dict(
-                issnp=row['issnp'],
-                issne=row['issne'],
-                country=row['country'],
+                issnp=row["issnp"], issne=row["issne"], country=row["country"],
             )
-            if row['lang']:
-                extra['languages'] = [row['lang'],]
-            if row['sherpa_color']:
-                extra['sherpa_romeo'] = dict(color=row['sherpa_color'])
+            if row["lang"]:
+                extra["languages"] = [
+                    row["lang"],
+                ]
+            if row["sherpa_color"]:
+                extra["sherpa_romeo"] = dict(color=row["sherpa_color"])
 
             urls = []
             webarchive_urls = []
-            cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [row['issnl']])
+            cur = self.db.execute(
+                "SELECT * FROM homepage WHERE issnl = ?;", [row["issnl"]]
+            )
             for hrow in cur:
-                if '://doaj.org/' in hrow['url'] or '://www.doaj.org/' in hrow['url']:
+                if "://doaj.org/" in hrow["url"] or "://www.doaj.org/" in hrow["url"]:
                     continue
-                if '://www.ncbi.nlm.nih.gov/' in hrow['url']:
+                if "://www.ncbi.nlm.nih.gov/" in hrow["url"]:
                     continue
-                if 'web.archive.org/web' in hrow['url']:
-                    webarchive_urls.append(hrow['url'])
-                    urls.append(hrow['url'])
+                if "web.archive.org/web" in hrow["url"]:
+                    webarchive_urls.append(hrow["url"])
+                    urls.append(hrow["url"])
                     continue
-                if hrow['host'] in ('www.google.com', 'books.google.com'):
+                if hrow["host"] in ("www.google.com", "books.google.com"):
                     # individual books or google searches, not journal/conference homepages
                     continue
-                if '/oai/request' in hrow['url']:
+                if "/oai/request" in hrow["url"]:
                     # OAI-PMH endpoints, not homepages
                     continue
-                if not row['any_live_homepage'] and hrow['gwb_url_success_dt'] and hrow['gwb_url_success_dt'] != 'error':
-                    webarchive_urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url']))
+                if (
+                    not row["any_live_homepage"]
+                    and hrow["gwb_url_success_dt"]
+                    and hrow["gwb_url_success_dt"] != "error"
+                ):
+                    webarchive_urls.append(
+                        "https://web.archive.org/web/{}/{}".format(
+                            hrow["gwb_url_success_dt"], hrow["url"]
+                        )
+                    )
                     continue
-                if hrow['blocked']:
-                    urls.append(hrow['url'])
+                if hrow["blocked"]:
+                    urls.append(hrow["url"])
                     continue
-                if hrow['terminal_status_code'] == 200:
-                    if hrow['terminal_url'] == hrow['url'].replace('http://', 'https://') or hrow['terminal_url'] == hrow['url'] + "/":
+                if hrow["terminal_status_code"] == 200:
+                    if (
+                        hrow["terminal_url"]
+                        == hrow["url"].replace("http://", "https://")
+                        or hrow["terminal_url"] == hrow["url"] + "/"
+                    ):
                         # check for trivial redirects; use post-redirect URL in those cases
-                        urls.append(hrow['terminal_url'])
+                        urls.append(hrow["terminal_url"])
                    else:
-                        urls.append(hrow['url'])
+                        urls.append(hrow["url"])
                     continue
                 # didn't even crawl and no match? add anyways as a pass-through
-                if not hrow['status_code']:
-                    urls.append(hrow['url'])
+                if not hrow["status_code"]:
+                    urls.append(hrow["url"])
                     continue
-            extra['webarchive_urls'] = webarchive_urls
-            extra['urls'] = urls
+            extra["webarchive_urls"] = webarchive_urls
+            extra["urls"] = urls
 
-            cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [row['issnl']])
+            cur = self.db.execute(
+                "SELECT * FROM directory WHERE issnl = ?;", [row["issnl"]]
+            )
             for drow in cur:
-                if drow['slug'] == 'ezb':
-                    ezb = json.loads(drow['extra'])
-                    extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color'])
-                elif drow['slug'] == 'szczepanski':
-                    extra['szczepanski'] = drow['extra']
-                elif drow['slug'] == 'doaj':
-                    extra['doaj'] = json.loads(drow['extra'])
-                elif drow['slug'] == 'scielo':
-                    extra['scielo'] = json.loads(drow['extra'])
-                elif drow['slug'] == 'sim':
-                    extra['ia'] = extra.get('ia', {})
-                    extra['ia']['sim'] = json.loads(drow['extra'])
-                    extra['ia']['sim']['sim_pubid'] = drow['identifier']
-                elif drow['slug'] in ('lockss', 'clockss', 'portico', 'jstor'):
-                    extra['kbart'] = extra.get('kbart', {})
-                    extra['kbart'][drow['slug']] = json.loads(drow['extra'])
-
-            out['extra'] = extra
+                if drow["slug"] == "ezb":
+                    ezb = json.loads(drow["extra"])
+                    extra["ezb"] = dict(
+                        ezb_id=drow["identifier"], color=ezb["ezb_color"]
+                    )
+                elif drow["slug"] == "szczepanski":
+                    extra["szczepanski"] = drow["extra"]
+                elif drow["slug"] == "doaj":
+                    extra["doaj"] = json.loads(drow["extra"])
+                elif drow["slug"] == "scielo":
+                    extra["scielo"] = json.loads(drow["extra"])
+                elif drow["slug"] == "sim":
+                    extra["ia"] = extra.get("ia", {})
+                    extra["ia"]["sim"] = json.loads(drow["extra"])
+                    extra["ia"]["sim"]["sim_pubid"] = drow["identifier"]
+                elif drow["slug"] in ("lockss", "clockss", "portico", "jstor"):
+                    extra["kbart"] = extra.get("kbart", {})
+                    extra["kbart"][drow["slug"]] = json.loads(drow["extra"])
+
+            out["extra"] = extra
             print(json.dumps(out))
         return counts
 
     def init_db(self):
         print("### Creating Database...", file=sys.stderr)
-        self.db.executescript("""
+        self.db.executescript(
+            """
             PRAGMA main.page_size = 4096;
             PRAGMA main.cache_size = 20000;
            PRAGMA main.locking_mode = EXCLUSIVE;
             PRAGMA main.synchronous = OFF;
-        """)
-        with open('chocula_schema.sql', 'r') as fschema:
+        """
+        )
+        with open("chocula_schema.sql", "r") as fschema:
            self.db.executescript(fschema.read())
         print("Done!", file=sys.stderr)
-