-rw-r--r-- | chocula/__init__.py | 2
-rwxr-xr-x | chocula/chocula.py (renamed from chocula.py) | 462
-rw-r--r-- | chocula/config.py | 30
-rw-r--r-- | chocula/util.py | 311
-rwxr-xr-x | chocula_tool.py | 123
-rw-r--r-- | extra/count_chocola.jpg (renamed from count_chocola.jpg) | bin 32753 -> 32753 bytes
-rw-r--r-- | extra/wikidata/wikidata.sparql (renamed from wikidata.sparql) | 0
7 files changed, 470 insertions, 458 deletions
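
For orientation, the layout this commit produces (per the diffstat above and the hunks below) is, roughly:

    chocula/
        __init__.py     # re-exports ChoculaDatabase
        chocula.py      # the ChoculaDatabase class (moved from top-level chocula.py)
        config.py       # data-file paths (ISSNL_FILE, DOAJ_FILE, ...)
        util.py         # parsing helpers (parse_url, parse_lang, merge_spans, ...)
    chocula_tool.py     # argparse CLI entry point (was the bottom half of chocula.py)
    extra/              # non-code assets (count_chocola.jpg, wikidata/wikidata.sparql)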
diff --git a/chocula/__init__.py b/chocula/__init__.py new file mode 100644 index 0000000..0b8a5e1 --- /dev/null +++ b/chocula/__init__.py @@ -0,0 +1,2 @@ + +from chocula.chocula import ChoculaDatabase diff --git a/chocula.py b/chocula/chocula.py index f34a4e2..27d6f80 100755 --- a/chocula.py +++ b/chocula/chocula.py @@ -1,398 +1,14 @@ -#!/usr/bin/env python3 -""" -Count Chocula - online serials metadata and stats - - "one, two, three, un-preserved web-native open-access long-tail indie - journals, hah, hah, hah!" - - (yeah, I know, this name isn't very good) - (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html) - -Commands: - - everything - init_db - summarize - export - export_fatcat - - index_doaj - index_road - index_crossref - index_entrez - index_norwegian - index_szczepanski - index_ezb - index_wikidata - index_openapc - - load_fatcat - load_fatcat_stats - - export_urls - update_url_status - -Future commands: - - index_jurn - index_datacite - preserve_kbart --keeper SLUG - preserve_sim - -See TODO.md for more work-in-progress -""" - -import sys, csv, json +import csv +import json from collections import Counter import sqlite3 -import argparse import ftfy -import urlcanon -import surt -import tldextract -import pycountry import stdnum.issn - -################### File Config - -ISSNL_FILE = 'data/20200323.ISSN-to-ISSN-L.txt' - -ENTREZ_FILE = 'data/entrez-journals.csv' -ROAD_FILE = 'data/road-2018-01-24.tsv' -ROAD_DATE = '2018-01-24' -DOAJ_FILE = 'data/journalcsv__doaj_20191221_0135_utf8.csv' -DOAJ_DATE = '2019-12-21' -CROSSREF_FILE = 'data/doi_titles_file_2019-12-20.csv' -SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv' -SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv' -NORWEGIAN_FILE = 'data/2019-12-21 Norwegian Register for Scientific Journals and Series.csv' -NORWEGIAN_DATE = '2019-12-21' -LOCKSS_FILE = 'data/kbart_LOCKSS.txt' -CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt' -PORTICO_FILE = 'data/Portico_Holding_KBart.txt' -JSTOR_FILE = 'data/JSTOR_Global_AllArchiveTitles_2019-12-21.txt' -SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv' -SZCZEPANSKI_DATE = '2018' -SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json' -EZB_FILE = 'data/ezb_metadata.json' -GOLD_OA_FILE = 'data/ISSN_Gold-OA_3.0.csv' -WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-12-20.tsv' -OPENAPC_FILE = 'data/apc_de.2019-12-20.csv' -FATCAT_CONTAINER_FILE = 'data/container_export.2019-12-13.json' - -IA_CRAWL_FILE = 'data/url_status.20191223.json' -FATCAT_STATS_FILE = 'data/container_stats.20191213.json' - - -################### Utilities - -# NOTE: this is a partial list, focusing on non-publisher hosted platforms and -# software frameworks -PLATFORM_MAP = { - 'OJS': 'ojs', - 'BMC': 'bmc', - 'SciELO Brazil': 'scielo', - 'SciELO Argentina': 'scielo', - 'SciELO': 'scielo', - 'SciELO Mexico': 'scielo', - 'SciELO Spain': 'scielo', - 'SciELO Portugal': 'scielo', - 'WordPress': 'wordpress', - 'Sciendo': 'sciendo', - 'Drupal': 'drupal', - 'revues.org': 'openedition', -} - -MIMETYPE_MAP = { - 'PDF': 'application/pdf', - 'HTML': 'text/html', - 'XML': 'application/xml', -} - -BIG5_PUBLISHERS = [ - 'Elsevier', - 'Informa UK (Taylor & Francis)', - 'Springer-Verlag', - 'SAGE Publications', - 'Wiley (Blackwell Publishing)', - 'Wiley (John Wiley & Sons)', - 'Springer (Biomed Central Ltd.)', - 'Springer Nature', -] -COMMERCIAL_PUBLISHERS = [ - 'Peter Lang International Academic Publishers', - 'Walter de Gruyter GmbH', - 'Oldenbourg Wissenschaftsverlag', 
- 'Georg Thieme Verlag KG', # not springer - 'Emerald (MCB UP )', - 'Medknow Publications', - 'Inderscience Enterprises Ltd', - 'Bentham Science', - 'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins', - 'Scientific Research Publishing, Inc', - 'MDPI AG', - 'S. Karger AG', - 'Pleiades Publishing', - 'Science Publishing Group', - 'IGI Global', - 'The Economist Intelligence Unit', - 'Maney Publishing', - 'Diva Enterprises Private Limited', - 'World Scientific', - 'Mary Ann Liebert', - 'Trans Tech Publications', -] -OA_PUBLISHERS = [ - 'Hindawi Limited', - 'OMICS Publishing Group', - 'De Gruyter Open Sp. z o.o.', - 'OpenEdition', - 'Hindawi (International Scholarly Research Network)', - 'Public Library of Science', - 'Frontiers Media SA', - 'eLife Sciences Publications, Ltd', - 'MDPI AG', - 'Hindawi (International Scholarly Research Network)', - 'Dove Medical Press', - 'Open Access Text', -] -SOCIETY_PUBLISHERS = [ - 'Institute of Electrical and Electronics Engineers', - 'Institution of Electrical Engineers', - 'Association for Computing Machinery', - 'American Psychological Association', - 'IOS Press', - 'IOP Publishing', - 'American Chemical Society', - 'Royal Society of Chemistry (RSC)', - 'American Geophysical Union', - 'American College of Physicians', - 'New England Journal of Medicine', - 'BMJ', - 'RCN Publishing', - 'International Union of Crystallography', - 'Portland Press', - 'ASME International', -] -UNI_PRESS_PUBLISHERS = [ - 'Cambridge University Press', - 'Oxford University Press', - 'The University of Chicago Press', - 'MIT Press', -] -ARCHIVE_PUBLISHERS = [ - 'JSTOR', - 'Portico', -] -REPOSITORY_PUBLISHERS = [ - 'PERSEE Program', - 'Social Science Electronic Publishing', - 'CAIRN', - 'CSIRO Publishing', -] -OTHER_PUBLISHERS = [ - 'African Journals Online', - 'Smithsonian Institution Biodiversity Heritage Library', - 'Canadian Science Publishing', - 'Philosophy Documentation Center', - 'Project MUSE', -] - -def unquote(s): - if s.startswith('"'): - s = s[1:] - if s.endswith('"'): - s = s[:-1] - if s.endswith('.'): - s = s[:-1] - return s.strip() - -def parse_lang(s): - if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'): - return None - try: - if len(s) == 2: - lang = pycountry.languages.get(alpha2=s.lower()) - elif len(s) == 3: - lang = pycountry.languages.get(alpha3=s.lower()) - else: - lang = pycountry.languages.get(name=s) - return lang.alpha2.lower() - except KeyError: - return None - except AttributeError: - return None - -def parse_country(s): - if not s or s in ('Unknown'): - return None - try: - if len(s) == 2: - country = pycountry.countries.get(alpha2=s.lower()) - else: - country = pycountry.countries.get(name=s) - except KeyError: - return None - if country: - return country.alpha_2.lower() - else: - return None - -def parse_mimetypes(val): - # XXX: multiple mimetypes? 
- if not val: - return - mimetype = None - if '/' in val: - mimetype = val - else: - mimetype = MIMETYPE_MAP.get(val) - if not mimetype: - return None - return [mimetype] - -def gaps_to_spans(first, last, gaps): - if not gaps: - return [[first, last]] - if not (last >= first and max(gaps) < last and min(gaps) > first): - # mangled - print("mangled years: {}".format((first, last, gaps))) - return [] - full = list(range(first, last+1)) - for missing in gaps: - full.remove(missing) - spans = [] - low = None - last = None - for year in full: - if not low: - low = year - last = year - continue - if year != last+1: - spans.append([low, last]) - low = year - last = year - last = year - if low: - spans.append([low, last]) - return spans - -def test_gaps(): - assert gaps_to_spans(1900, 1900, None) == \ - [[1900, 1900]] - assert gaps_to_spans(1900, 1903, None) == \ - [[1900, 1903]] - assert gaps_to_spans(1900, 1902, [1901]) == \ - [[1900, 1900], [1902, 1902]] - assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \ - [[1950, 1954], [1957, 1964], [1966, 1970]] - -def merge_spans(old, new): - if not new: - return old - if not old: - old = [] - old.extend(new) - years = set() - for span in old: - for y in range(span[0], span[1]+1): - years.add(y) - if not years: - return [] - spans = [] - start = None - last = None - todo = False - for y in sorted(list(years)): - if start == None: - # very first - start = y - last = y - todo = True - continue - if y == last + 1: - # span continues - last = y - todo = True - continue - # a gap just happened! - spans.append([start, last]) - start = y - last = y - todo = True - if todo: - spans.append([start, last]) - return spans - -def test_merge_spans(): - assert merge_spans([[5, 10]], [[10, 20]]) == \ - [[5, 20]] - assert merge_spans([[5, 9]], [[10, 20]]) == \ - [[5, 20]] - assert merge_spans([[5, 11]], [[10, 20]]) == \ - [[5, 20]] - assert merge_spans([], []) == \ - [] - assert merge_spans([[9, 11]], []) == \ - [[9,11]] - assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \ - [[1450, 1900], [2000, 2000]] - - -def parse_url(url): - """ - Parses/cleans URLs. - - Returns a dict with: - - url: str, cleaned/normalized URL - url_surt: str, "sortable url" (a web-archiving format) - host: str, full hostname - registered_domain: "primary domain", eg "google.com" or "thing.co.uk" - suffix: str, eg "com" or "co.uk" - - Returns None if url is really bad (not a URL). 
- """ - if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'): - return None - if url.startswith('www.'): - url = "http://" + url - if url.startswith('ttp://') or url.startswith('ttps://'): - url = "h" + url - url.replace('Http://', 'http://') - - url = str(urlcanon.semantic_precise(url)) - if url == 'http://na/': - # sort of redundant with above, but some only match after canonicalization - return None - url_surt = surt.surt(url) - tld = tldextract.extract(url) - host = '.'.join(tld) - if host.startswith('.'): - host = host[1:] - return dict(url=url, - url_surt=url_surt or None, - host=host or None, - registered_domain=tld.registered_domain or None, - suffix=tld.suffix or None) - -def test_parse_url(): - - assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk' - assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk' - assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk' - - assert parse_url("google.com")['suffix'] == 'com' - assert parse_url("google.com")['host'] == 'google.com' - - assert parse_url("mailto:bnewbold@bogus.com") == None - assert parse_url("thing.com")['url'] == 'http://thing.com/' - assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/' - +from chocula.config import * +from chocula.util import * ################### Main Class @@ -1397,73 +1013,3 @@ class ChoculaDatabase(): self.db.executescript(fschema.read()) print("Done!") -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - subparsers = parser.add_subparsers() - - parser.add_argument("--db-file", - help="run in mode that considers only terminal HTML success", - default='chocula.sqlite', - type=str) - parser.add_argument("--input-file", - help="override default input file path", - default=None, - type=str) - - sub = subparsers.add_parser('everything', - help="run all the commands") - sub.set_defaults(func='everything') - - sub = subparsers.add_parser('init_db', - help="create sqlite3 output file and tables") - sub.set_defaults(func='init_db') - - sub = subparsers.add_parser('summarize', - help="aggregate metadata from all tables into 'journals' table") - sub.set_defaults(func='summarize') - - sub = subparsers.add_parser('export', - help="dump JSON output") - sub.set_defaults(func='export') - - sub = subparsers.add_parser('export_fatcat', - help="dump JSON output in a format that can load into fatcat") - sub.set_defaults(func='export_fatcat') - - # TODO: 'jurn' - for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb', 'gold_oa', 'wikidata', 'openapc'): - sub = subparsers.add_parser('index_{}'.format(ind), - help="load metadata from {}".format(ind)) - sub.set_defaults(func='index_{}'.format(ind)) - - sub = subparsers.add_parser('load_fatcat', - help="load fatcat container metadata") - sub.set_defaults(func='load_fatcat') - - sub = subparsers.add_parser('load_fatcat_stats', - help="update container-level stats from JSON file") - sub.set_defaults(func='load_fatcat_stats') - - sub = subparsers.add_parser('export_urls', - help="dump homepage URLs (eg, to crawl for status)") - sub.set_defaults(func='export_urls') - - sub = subparsers.add_parser('update_url_status', - help="import homepage URL crawl status") - sub.set_defaults(func='update_url_status') - - args = parser.parse_args() - if not args.__dict__.get("func"): - print("tell me what to do! 
(try --help)") - sys.exit(-1) - - cdb = ChoculaDatabase(args.db_file) - if args.func.startswith('index_') or args.func in ('everything','summarize',): - cdb.read_issn_map_file(ISSNL_FILE) - func = getattr(cdb, args.func) - func(args) - -if __name__ == '__main__': - main() - diff --git a/chocula/config.py b/chocula/config.py new file mode 100644 index 0000000..a32bdd1 --- /dev/null +++ b/chocula/config.py @@ -0,0 +1,30 @@ + +################### File Config + +ISSNL_FILE = 'data/20200323.ISSN-to-ISSN-L.txt' + +ENTREZ_FILE = 'data/entrez-journals.csv' +ROAD_FILE = 'data/road-2018-01-24.tsv' +ROAD_DATE = '2018-01-24' +DOAJ_FILE = 'data/journalcsv__doaj_20191221_0135_utf8.csv' +DOAJ_DATE = '2019-12-21' +CROSSREF_FILE = 'data/doi_titles_file_2019-12-20.csv' +SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv' +SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv' +NORWEGIAN_FILE = 'data/2019-12-21 Norwegian Register for Scientific Journals and Series.csv' +NORWEGIAN_DATE = '2019-12-21' +LOCKSS_FILE = 'data/kbart_LOCKSS.txt' +CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt' +PORTICO_FILE = 'data/Portico_Holding_KBart.txt' +JSTOR_FILE = 'data/JSTOR_Global_AllArchiveTitles_2019-12-21.txt' +SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv' +SZCZEPANSKI_DATE = '2018' +SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json' +EZB_FILE = 'data/ezb_metadata.json' +GOLD_OA_FILE = 'data/ISSN_Gold-OA_3.0.csv' +WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-12-20.tsv' +OPENAPC_FILE = 'data/apc_de.2019-12-20.csv' +FATCAT_CONTAINER_FILE = 'data/container_export.2019-12-13.json' + +IA_CRAWL_FILE = 'data/url_status.20191223.json' +FATCAT_STATS_FILE = 'data/container_stats.20191213.json' diff --git a/chocula/util.py b/chocula/util.py new file mode 100644 index 0000000..533b41a --- /dev/null +++ b/chocula/util.py @@ -0,0 +1,311 @@ + +import urlcanon +import surt +import tldextract +import pycountry + +################### Utilities + +# NOTE: this is a partial list, focusing on non-publisher hosted platforms and +# software frameworks +PLATFORM_MAP = { + 'OJS': 'ojs', + 'BMC': 'bmc', + 'SciELO Brazil': 'scielo', + 'SciELO Argentina': 'scielo', + 'SciELO': 'scielo', + 'SciELO Mexico': 'scielo', + 'SciELO Spain': 'scielo', + 'SciELO Portugal': 'scielo', + 'WordPress': 'wordpress', + 'Sciendo': 'sciendo', + 'Drupal': 'drupal', + 'revues.org': 'openedition', +} + +MIMETYPE_MAP = { + 'PDF': 'application/pdf', + 'HTML': 'text/html', + 'XML': 'application/xml', +} + +BIG5_PUBLISHERS = [ + 'Elsevier', + 'Informa UK (Taylor & Francis)', + 'Springer-Verlag', + 'SAGE Publications', + 'Wiley (Blackwell Publishing)', + 'Wiley (John Wiley & Sons)', + 'Springer (Biomed Central Ltd.)', + 'Springer Nature', +] +COMMERCIAL_PUBLISHERS = [ + 'Peter Lang International Academic Publishers', + 'Walter de Gruyter GmbH', + 'Oldenbourg Wissenschaftsverlag', + 'Georg Thieme Verlag KG', # not springer + 'Emerald (MCB UP )', + 'Medknow Publications', + 'Inderscience Enterprises Ltd', + 'Bentham Science', + 'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins', + 'Scientific Research Publishing, Inc', + 'MDPI AG', + 'S. Karger AG', + 'Pleiades Publishing', + 'Science Publishing Group', + 'IGI Global', + 'The Economist Intelligence Unit', + 'Maney Publishing', + 'Diva Enterprises Private Limited', + 'World Scientific', + 'Mary Ann Liebert', + 'Trans Tech Publications', +] +OA_PUBLISHERS = [ + 'Hindawi Limited', + 'OMICS Publishing Group', + 'De Gruyter Open Sp. 
z o.o.', + 'OpenEdition', + 'Hindawi (International Scholarly Research Network)', + 'Public Library of Science', + 'Frontiers Media SA', + 'eLife Sciences Publications, Ltd', + 'MDPI AG', + 'Hindawi (International Scholarly Research Network)', + 'Dove Medical Press', + 'Open Access Text', +] +SOCIETY_PUBLISHERS = [ + 'Institute of Electrical and Electronics Engineers', + 'Institution of Electrical Engineers', + 'Association for Computing Machinery', + 'American Psychological Association', + 'IOS Press', + 'IOP Publishing', + 'American Chemical Society', + 'Royal Society of Chemistry (RSC)', + 'American Geophysical Union', + 'American College of Physicians', + 'New England Journal of Medicine', + 'BMJ', + 'RCN Publishing', + 'International Union of Crystallography', + 'Portland Press', + 'ASME International', +] +UNI_PRESS_PUBLISHERS = [ + 'Cambridge University Press', + 'Oxford University Press', + 'The University of Chicago Press', + 'MIT Press', +] +ARCHIVE_PUBLISHERS = [ + 'JSTOR', + 'Portico', +] +REPOSITORY_PUBLISHERS = [ + 'PERSEE Program', + 'Social Science Electronic Publishing', + 'CAIRN', + 'CSIRO Publishing', +] +OTHER_PUBLISHERS = [ + 'African Journals Online', + 'Smithsonian Institution Biodiversity Heritage Library', + 'Canadian Science Publishing', + 'Philosophy Documentation Center', + 'Project MUSE', +] + +def unquote(s): + if s.startswith('"'): + s = s[1:] + if s.endswith('"'): + s = s[:-1] + if s.endswith('.'): + s = s[:-1] + return s.strip() + +def parse_lang(s): + if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'): + return None + try: + if len(s) == 2: + lang = pycountry.languages.get(alpha2=s.lower()) + elif len(s) == 3: + lang = pycountry.languages.get(alpha3=s.lower()) + else: + lang = pycountry.languages.get(name=s) + return lang.alpha2.lower() + except KeyError: + return None + except AttributeError: + return None + +def parse_country(s): + if not s or s in ('Unknown'): + return None + try: + if len(s) == 2: + country = pycountry.countries.get(alpha2=s.lower()) + else: + country = pycountry.countries.get(name=s) + except KeyError: + return None + if country: + return country.alpha_2.lower() + else: + return None + +def parse_mimetypes(val): + # XXX: multiple mimetypes? 
+ if not val: + return + mimetype = None + if '/' in val: + mimetype = val + else: + mimetype = MIMETYPE_MAP.get(val) + if not mimetype: + return None + return [mimetype] + +def gaps_to_spans(first, last, gaps): + if not gaps: + return [[first, last]] + if not (last >= first and max(gaps) < last and min(gaps) > first): + # mangled + print("mangled years: {}".format((first, last, gaps))) + return [] + full = list(range(first, last+1)) + for missing in gaps: + full.remove(missing) + spans = [] + low = None + last = None + for year in full: + if not low: + low = year + last = year + continue + if year != last+1: + spans.append([low, last]) + low = year + last = year + last = year + if low: + spans.append([low, last]) + return spans + +def test_gaps(): + assert gaps_to_spans(1900, 1900, None) == \ + [[1900, 1900]] + assert gaps_to_spans(1900, 1903, None) == \ + [[1900, 1903]] + assert gaps_to_spans(1900, 1902, [1901]) == \ + [[1900, 1900], [1902, 1902]] + assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \ + [[1950, 1954], [1957, 1964], [1966, 1970]] + +def merge_spans(old, new): + if not new: + return old + if not old: + old = [] + old.extend(new) + years = set() + for span in old: + for y in range(span[0], span[1]+1): + years.add(y) + if not years: + return [] + spans = [] + start = None + last = None + todo = False + for y in sorted(list(years)): + if start == None: + # very first + start = y + last = y + todo = True + continue + if y == last + 1: + # span continues + last = y + todo = True + continue + # a gap just happened! + spans.append([start, last]) + start = y + last = y + todo = True + if todo: + spans.append([start, last]) + return spans + +def test_merge_spans(): + assert merge_spans([[5, 10]], [[10, 20]]) == \ + [[5, 20]] + assert merge_spans([[5, 9]], [[10, 20]]) == \ + [[5, 20]] + assert merge_spans([[5, 11]], [[10, 20]]) == \ + [[5, 20]] + assert merge_spans([], []) == \ + [] + assert merge_spans([[9, 11]], []) == \ + [[9,11]] + assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \ + [[1450, 1900], [2000, 2000]] + + +def parse_url(url): + """ + Parses/cleans URLs. + + Returns a dict with: + + url: str, cleaned/normalized URL + url_surt: str, "sortable url" (a web-archiving format) + host: str, full hostname + registered_domain: "primary domain", eg "google.com" or "thing.co.uk" + suffix: str, eg "com" or "co.uk" + + Returns None if url is really bad (not a URL). 
+ """ + if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'): + return None + if url.startswith('www.'): + url = "http://" + url + if url.startswith('ttp://') or url.startswith('ttps://'): + url = "h" + url + url.replace('Http://', 'http://') + + url = str(urlcanon.semantic_precise(url)) + if url == 'http://na/': + # sort of redundant with above, but some only match after canonicalization + return None + url_surt = surt.surt(url) + tld = tldextract.extract(url) + host = '.'.join(tld) + if host.startswith('.'): + host = host[1:] + return dict(url=url, + url_surt=url_surt or None, + host=host or None, + registered_domain=tld.registered_domain or None, + suffix=tld.suffix or None) + +def test_parse_url(): + + assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk' + assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk' + assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk' + + assert parse_url("google.com")['suffix'] == 'com' + assert parse_url("google.com")['host'] == 'google.com' + + assert parse_url("mailto:bnewbold@bogus.com") == None + assert parse_url("thing.com")['url'] == 'http://thing.com/' + assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/' diff --git a/chocula_tool.py b/chocula_tool.py new file mode 100755 index 0000000..345097e --- /dev/null +++ b/chocula_tool.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 + +""" +Count Chocula - online serials metadata and stats + + "one, two, three, un-preserved web-native open-access long-tail indie + journals, hah, hah, hah!" + + (yeah, I know, this name isn't very good) + (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html) + +Commands: + + everything + init_db + summarize + export + export_fatcat + + index_doaj + index_road + index_crossref + index_entrez + index_norwegian + index_szczepanski + index_ezb + index_wikidata + index_openapc + + load_fatcat + load_fatcat_stats + + export_urls + update_url_status + +Future commands: + + index_jurn + index_datacite + preserve_kbart --keeper SLUG + preserve_sim + +See TODO.md for more work-in-progress +""" + +import sys +import csv +import argparse + +from chocula import ChoculaDatabase +from chocula.config import * + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + subparsers = parser.add_subparsers() + + parser.add_argument("--db-file", + help="run in mode that considers only terminal HTML success", + default='chocula.sqlite', + type=str) + parser.add_argument("--input-file", + help="override default input file path", + default=None, + type=str) + + sub = subparsers.add_parser('everything', + help="run all the commands") + sub.set_defaults(func='everything') + + sub = subparsers.add_parser('init_db', + help="create sqlite3 output file and tables") + sub.set_defaults(func='init_db') + + sub = subparsers.add_parser('summarize', + help="aggregate metadata from all tables into 'journals' table") + sub.set_defaults(func='summarize') + + sub = subparsers.add_parser('export', + help="dump JSON output") + sub.set_defaults(func='export') + + sub = subparsers.add_parser('export_fatcat', + help="dump JSON output in a format that can load into fatcat") + sub.set_defaults(func='export_fatcat') + + # TODO: 'jurn' + for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb', 'gold_oa', 'wikidata', 'openapc'): + sub = subparsers.add_parser('index_{}'.format(ind), + help="load metadata from 
{}".format(ind)) + sub.set_defaults(func='index_{}'.format(ind)) + + sub = subparsers.add_parser('load_fatcat', + help="load fatcat container metadata") + sub.set_defaults(func='load_fatcat') + + sub = subparsers.add_parser('load_fatcat_stats', + help="update container-level stats from JSON file") + sub.set_defaults(func='load_fatcat_stats') + + sub = subparsers.add_parser('export_urls', + help="dump homepage URLs (eg, to crawl for status)") + sub.set_defaults(func='export_urls') + + sub = subparsers.add_parser('update_url_status', + help="import homepage URL crawl status") + sub.set_defaults(func='update_url_status') + + args = parser.parse_args() + if not args.__dict__.get("func"): + print("tell me what to do! (try --help)") + sys.exit(-1) + + cdb = ChoculaDatabase(args.db_file) + if args.func.startswith('index_') or args.func in ('everything','summarize',): + cdb.read_issn_map_file(ISSNL_FILE) + func = getattr(cdb, args.func) + func(args) + +if __name__ == '__main__': + main() + diff --git a/count_chocola.jpg b/extra/count_chocola.jpg Binary files differindex e9da539..e9da539 100644 --- a/count_chocola.jpg +++ b/extra/count_chocola.jpg diff --git a/wikidata.sparql b/extra/wikidata/wikidata.sparql index 3f7e2f9..3f7e2f9 100644 --- a/wikidata.sparql +++ b/extra/wikidata/wikidata.sparql |