From 57db2db336c08031324e44b2d2880fbd4b6893c9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 1 Jun 2020 17:01:20 -0700
Subject: 'everything' at least partially working

---
 Makefile                        |  59 ++++++++-------
 TODO.md                         |  48 ++++++++++---
 chocula/__init__.py             |  12 +---
 chocula/__main__.py             | 155 ++++++++++++++++++++++++++++++++++++++++
 chocula/common.py               |  27 +++++++
 chocula/database.py             | 142 +++++++++---------------------
 chocula/directories/__init__.py |  19 +++++
 chocula/directories/sim.py      |  71 ++++++++++++++++++
 chocula_tool.py                 | 123 ------------------------------
 sources.toml                    |   8 ++-
 tests/test_directories.py       |   4 +-
 11 files changed, 386 insertions(+), 282 deletions(-)
 create mode 100755 chocula/__main__.py
 create mode 100644 chocula/directories/__init__.py
 create mode 100644 chocula/directories/sim.py
 delete mode 100755 chocula_tool.py

diff --git a/Makefile b/Makefile
index 53e0c6d..977c80e 100644
--- a/Makefile
+++ b/Makefile
@@ -4,39 +4,37 @@ SNAPSHOTITEM := $(shell grep ia_item sources.toml | cut -f2 -d'"')
 
 .PHONY: help
 help: ## Print info about all commands
-	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+	@echo "Commands:"
+	@echo
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "    \033[01;32m%-20s\033[0m %s\n", $$1, $$2}'
 
 .PHONY: test
 test: ## Run all tests and lints
 	pipenv run pytest
 	pipenv run mypy *.py chocula/*.py chocula/*/*.py --ignore-missing-imports
 
-#.PHONY: database
-#database: ## Build database from sources
-#	@if [ ! -f data/ISSN-to-ISSN-L.txt ]; then echo "You must run 'make fetch-sources' first"; exit -1; fi
-#	pipenv run ./chocula_tool.py everything
-
-#data/container_stats.json:
-#	cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
-#	cat /tmp/container_issnl.tsv | parallel -j10 curl -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . > /tmp/container_stats.json
-#	mv /tmp/container_stats.json data
+data/container_stats.json:
+	mkdir -p data
+	cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
+	cat /tmp/container_issnl.tsv | parallel -j10 curl -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . > /tmp/container_stats.json
+	mv /tmp/container_stats.json data
 
-#.PHONY: container-stats
-#container-stats: data/container_stats.json
-#	wc -l data/container_stats.json
-#	@echo
-#	@echo Done
+.PHONY: container-stats
+container-stats: data/container_stats.json
+	wc -l data/container_stats.json
+	@echo
+	@echo Done
 
-#data/homepage_status.json:
-#	pipenv run ./chocula.py export_urls | shuf > /tmp/chocula_urls_to_crawl.tsv
-#	pipenv run parallel -j10 --bar --pipepart -a /tmp/chocula_urls_to_crawl.shuf.tsv ./check_issn_urls.py > /tmp/homepage_status.json
-#	cp /tmp/homepage_status.json data/
+data/homepage_status.json:
+	pipenv run python -m chocula export_urls | shuf > /tmp/chocula_urls_to_crawl.shuf.tsv
+	pipenv run parallel -j10 --bar --pipepart -a /tmp/chocula_urls_to_crawl.shuf.tsv ./check_issn_urls.py > /tmp/homepage_status.json
+	cp /tmp/homepage_status.json data/
 
-#.PHONY: homepage-status
-#homepage-status: data/homepage_status.json
-#	wc -l data/homepage-status.json
-#	@echo
-#	@echo Done
+.PHONY: homepage-status
+homepage-status: data/homepage_status.json
+	wc -l data/homepage_status.json
+	@echo
+	@echo Done
 
 .PHONY: fetch-sources
 fetch-sources: ## Download existing snapshot versions of all sources from archive.org
@@ -45,6 +43,7 @@ fetch-sources: ## Download existing snapshot versions of all sources from archiv
 
 .PHONY: update-sources
 update-sources: ## Download new versions of updatable sources
+	@# TODO: refactor to be individual targets-per-file (see fatcat-covid19 example)
 	mkdir -p data/$(TODAY)
 	wget -c "https://www.issn.org/wp-content/uploads/2014/03/issnltables.zip" -O /tmp/issnltables.$(TODAY).zip
 	unzip -p /tmp/issnltables.$(TODAY).zip "*.ISSN-to-ISSN-L.txt" > /tmp/ISSN-to-ISSN-L.$(TODAY).txt
@@ -58,11 +57,17 @@ update-sources: ## Download new versions of updatable sources
 	@echo
 	@echo "Successfully updated for date (UTC): $(TODAY)"
 
-#.PHONY: upload-sources
-#upload-sources: ## Upload an updated snapshot of sources to archive.org
-#	ia upload --checksum chocula-sources-$(TODAY) data/*.tsv data/*.csv data/*.json data/*.txt
+.PHONY: upload-sources
+upload-sources: update-sources ## Upload most recent update-able sources to a new IA item
+	ia upload --checksum chocula-sources-snapshot-$(TODAY) data/$(TODAY)/*
+	# TODO: ia upload --checksum chocula-sources-$(TODAY) data/*.tsv data/*.csv data/*.json data/*.txt
 
 #.PHONY: upload-snapshot
 #upload-snapshot: ## Upload an sqlite snapshot to archive.org
 #	ia upload --checksum --no-derive chocula-snapshot-$(TODAY) chocula.sqlite3 README.md extra/count_chocula.jpg
+
+.PHONY: database
+database: ## Build database from sources
+	@if [ ! -f data/ISSN-to-ISSN-L.txt ]; then echo "You must run 'make fetch-sources' first"; exit -1; fi
+	pipenv run python -m chocula everything
diff --git a/TODO.md b/TODO.md
index 2d0c7e3..a6814a0 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,33 +1,66 @@
+2020-05-06
+x python3.7
+x type annotations / dataclasses
+x "update-sources"
+    => makefile
+- run "everything" successfully
+- "upload-sources"
+    => to archive.org, with datetime
+- "fetch-sources"
+    => all snapshots in a single ia item, with datetime
+- scielo journal metadata
+- kbart loading
+- "platform" column in database
+- rewrite README
+
+- flag to delete old table/rows when loading (?)
+- "loaders" not directories?
+- makefile
+- black
+- refactor most code into module directory
+- tests
+    => index process
+- update upstreams
+
+refactors:
+- "directory" command with directory as arg
+- "kbart" command with directory as arg
+- "load" command with directory as arg
+
+https://isaw.nyu.edu/publications/awol-index/
+
 ## Chocula
 
+- fully automated updates, luigi/gluish style
+    => downloads/uploads source metadata files
+    => outputs config file for chocula run
+    => runs chocula everything
+
 priorities:
-x fraction/which are pointing to wayback
 - coverage stats, particularly for longtail
-x wikidata linkage (prep for wikimania)
 - "still in print" flag
 - clean out invalid ISSN-L from fatcat
 - don't list dead URLs in fatcat
 - summary report of some of above
-- update all fatcat (wikidata QID, urls, fixed ISSN-L, etc)
 - when updating fatcat:
     if title is "blah, Proceedings of the", set type to proceedings and re-write title
     if title like "Workshop on", set type
 
 source improvements:
 - entrez: "NLM Unique Id"
 - JURN: finish
 - crossref: empty string identifiers?
+- scielo: https://scielo.org/en/journals/list-by-alphabetical-order/?export=csv
+- https://www.arc.gov.au/excellence-research-australia (journal list)
 - public scopus list (?)
 - scrape/munge public clarivate dumps
 - import JURN into fatcat (one way or another)
     => try to title match and get ISSN-L
     => manual lookups for remainders?
-- dump json
 - "GOLD" importer (for scopus/WoS)
 - check that all fields actually getting imported reasonably
-- homepage crawl/status script
 - could poll portal.issn.org like:
     https://portal.issn.org/resource/ISSN/1561-7645?format=json
@@ -40,7 +73,4 @@ source improvements:
 - update_url_status (needs re-write)
 - log out index issues (duplicate ISSN-L, etc) to a file
 - validate against GOLD OA list
-- decide what to do with JURN... match? fuzzy match? create missing fatcat?
-- lots of bogus ISSN-L, like 9999-9999 or 0000-0000. should both validate
-  check digit and require an ISSN-L to actually exist.
diff --git a/chocula/__init__.py b/chocula/__init__.py
index a0947e1..440e7a5 100644
--- a/chocula/__init__.py
+++ b/chocula/__init__.py
@@ -1,15 +1,5 @@
 
 from chocula.config import ChoculaConfig
+from chocula.directories import *
 from chocula.database import ChoculaDatabase, IssnDatabase
-from chocula.directories.crossref import CrossrefLoader
-from chocula.directories.doaj import DoajLoader
-from chocula.directories.entrez import EntrezLoader
-from chocula.directories.ezb import EzbLoader
-from chocula.directories.gold_oa import GoldOALoader
-from chocula.directories.norwegian import NorwegianLoader
-from chocula.directories.openapc import OpenAPCLoader
-from chocula.directories.road import RoadLoader
-from chocula.directories.sherpa_romeo import SherpaRomeoLoader
-from chocula.directories.szczepanski import SzczepanskiLoader
-from chocula.directories.wikidata import WikidataLoader
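With chocula/__init__.py now re-exporting everything from chocula.directories, adding a new metadata source is a matter of subclassing DirectoryLoader (see chocula/common.py below) and appending the class to ALL_CHOCULA_DIR_CLASSES. A minimal sketch of such a loader, for illustration only: the "example" slug, the [example] section it assumes in sources.toml, and the CSV column names are hypothetical; the DirectoryInfo fields follow the SimLoader added in this commit.

    # Hypothetical loader sketch; not part of this commit.
    import csv
    from typing import Iterable, Optional

    from chocula.common import DirectoryLoader
    from chocula.database import DirectoryInfo


    class ExampleLoader(DirectoryLoader):

        source_slug = "example"

        def open_file(self) -> Iterable:
            # assumes an [example] section in sources.toml providing a filepath
            return csv.DictReader(open(self.config.example.filepath))

        def parse_record(self, row) -> Optional[DirectoryInfo]:
            # rows without an ISSN yield None and are simply not inserted
            if not row.get("ISSN"):
                return None
            return DirectoryInfo(
                directory_slug=self.source_slug,
                raw_issn=row["ISSN"],
                name=row.get("Title"),
            )

Registering the class in ALL_CHOCULA_DIR_CLASSES (chocula/directories/__init__.py, below) automatically gives it an index_example subcommand and includes it in the everything run.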
diff --git a/chocula/__main__.py b/chocula/__main__.py
new file mode 100755
index 0000000..21f3976
--- /dev/null
+++ b/chocula/__main__.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+
+"""
+Count Chocula - online serials metadata and stats
+
+    "one, two, three, un-preserved web-native open-access long-tail indie
+    journals, hah, hah, hah!"
+
+    (yeah, I know, this name isn't very good)
+    (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html)
+
+Commands:
+
+    everything
+    init_db
+    summarize
+    export
+    export_fatcat
+
+    index_doaj
+    index_road
+    index_crossref
+    index_entrez
+    index_norwegian
+    index_szczepanski
+    index_ezb
+    index_wikidata
+    index_openapc
+    index_sim
+
+    load_fatcat_containers
+    load_fatcat_stats
+    load_homepage_status
+
+    export_urls
+
+Future commands:
+
+    index_jurn
+    index_datacite
+    preserve_kbart --keeper SLUG
+    preserve_sim
+
+See TODO.md for more work-in-progress
+"""
+
+import sys
+import csv
+import argparse
+
+from chocula import ChoculaDatabase, ChoculaConfig, IssnDatabase, ALL_CHOCULA_DIR_CLASSES
+
+
+def run_everything(config, database):
+
+    database.init_db()
+    for cls in ALL_CHOCULA_DIR_CLASSES:
+        loader = cls(config)
+        counts = loader.index_file(database)
+        print(counts)
+
+    # XXX: TODO:
+    database.load_fatcat_containers(config)
+    database.load_fatcat_stats(config)
+    # XXX: TODO:
+    #self.preserve_kbart('lockss', LOCKSS_FILE)
+    #self.preserve_kbart('clockss', CLOCKSS_FILE)
+    #self.preserve_kbart('portico', PORTICO_FILE)
+    #self.preserve_kbart('jstor', JSTOR_FILE)
+    #self.preserve_sim(args)
+    database.load_homepage_status(config)
+    database.summarize()
+    print("### Done with everything!")
+
+def run_index(config, database, cls):
+    loader = cls(config)
+    counts = loader.index_file(database)
+    return counts
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    subparsers = parser.add_subparsers()
+
+    parser.add_argument("--db-file",
+        help="sqlite database file",
+        default='chocula.sqlite',
+        type=str)
+
+    sub = subparsers.add_parser('everything',
+        help="run all the commands")
+    sub.set_defaults(func='everything')
+
+    sub = subparsers.add_parser('init_db',
+        help="create sqlite3 output file and tables")
+    sub.set_defaults(func='init_db')
+
+    sub = subparsers.add_parser('summarize',
+        help="aggregate metadata from all tables into 'journals' table")
+    sub.set_defaults(func='summarize')
+
+    sub = subparsers.add_parser('export',
+        help="dump JSON output")
+    sub.set_defaults(func='export')
+
+    sub = subparsers.add_parser('export_fatcat',
+        help="dump JSON output in a format that can load into fatcat")
+    sub.set_defaults(func='export_fatcat')
+
+    for cls in ALL_CHOCULA_DIR_CLASSES:
+        sub = subparsers.add_parser('index_{}'.format(cls.source_slug),
+            help="load metadata from {}".format(cls.source_slug))
+        sub.set_defaults(func='index_{}'.format(cls.source_slug), index_cls=cls)
+
+    sub = subparsers.add_parser('load_fatcat_containers',
+        help="load fatcat container metadata")
+    sub.set_defaults(func='load_fatcat_containers')
+
+    sub = subparsers.add_parser('load_fatcat_stats',
+        help="update container-level stats from JSON file")
+    sub.set_defaults(func='load_fatcat_stats')
+
+    sub = subparsers.add_parser('export_urls',
+        help="dump homepage URLs (eg, to crawl for status)")
+    sub.set_defaults(func='export_urls')
+
+    sub = subparsers.add_parser('load_homepage_status',
+        help="import homepage URL crawl status")
+    sub.set_defaults(func='load_homepage_status')
+
+    args = parser.parse_args()
+    if not args.__dict__.get("func"):
+        print("tell me what to do! (try --help)")
+        sys.exit(-1)
+
+    config = ChoculaConfig.from_file()
+    if args.func.startswith('index_') or args.func in ('everything', 'summarize',):
+        issn_db = IssnDatabase(config.issnl.filepath)
+    else:
+        issn_db = None
+    cdb = ChoculaDatabase(args.db_file, issn_db)
+    if args.func == 'everything':
+        run_everything(config, cdb)
+    elif args.func.startswith('index_'):
+        print(run_index(config, cdb, args.index_cls))
+    elif args.func.startswith('load_'):
+        func = getattr(cdb, args.func)
+        print(func(config))
+    else:
+        func = getattr(cdb, args.func)
+        print(func(), file=sys.stderr)
+
+if __name__ == '__main__':
+    main()
+
diff --git a/chocula/common.py b/chocula/common.py
index 54856c9..f515e6f 100644
--- a/chocula/common.py
+++ b/chocula/common.py
@@ -33,3 +33,30 @@ class DirectoryLoader():
         cur.close()
         db.db.commit()
         return counts
+
+class KbartLoader():
+
+    source_slug: str = "GENERIC"
+
+    def __init__(self, config: ChoculaConfig):
+        self.config = config
+
+    def open_file(self) -> Iterable:
+        raise NotImplementedError()
+
+    def parse_record(self, record) -> Optional[DirectoryInfo]:
+        raise NotImplementedError()
+
+    def index_file(self, db) -> Counter:
+        print(f"##### Loading {self.source_slug} KBART...", file=sys.stderr)
+        counts: Counter = Counter()
+        cur = db.db.cursor()
+        for record in self.open_file():
+            counts['total'] += 1
+            info = self.parse_record(record)
+            if info:
+                status = db.insert_directory(info, cur=cur)
+                counts[status] += 1
+        cur.close()
+        db.db.commit()
+        return counts
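The new KbartLoader deliberately leaves open_file() and parse_record() abstract, and run_everything() still has the preserve_kbart calls commented out, so no concrete subclass ships in this commit. As a hedged sketch of where this appears headed, here is one plausible keeper-specific subclass; the "lockss" slug, the [lockss] config entry, and the column names (taken from the standard KBART tab-separated format) are assumptions, not code from this repo.

    # Hypothetical KBART loader sketch; not part of this commit.
    import csv
    from typing import Iterable, Optional

    from chocula.common import KbartLoader
    from chocula.database import DirectoryInfo


    class LockssKbartLoader(KbartLoader):

        source_slug = "lockss"

        def open_file(self) -> Iterable:
            # KBART reports are tab-separated with a single header row
            return csv.DictReader(open(self.config.lockss.filepath), delimiter="\t")

        def parse_record(self, record) -> Optional[DirectoryInfo]:
            # prefer the electronic ISSN, fall back to print
            issn = record.get("online_identifier") or record.get("print_identifier")
            if not issn:
                return None
            return DirectoryInfo(
                directory_slug=self.source_slug,
                raw_issn=issn,
                name=record.get("publication_title"),
            )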
diff --git a/chocula/database.py b/chocula/database.py
index f6a000a..3efa725 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -15,6 +15,7 @@
 import tldextract
 import ftfy
 import stdnum.issn
 
+from chocula import *
 from chocula.util import *
 
@@ -271,7 +272,7 @@ class ChoculaDatabase():
 
         return "inserted"
 
-    def parse_kbart(self, name, path):
+    def parse_kbart(self, name, path) -> Counter:
         """
         Transforms a KBART file into a dict of dicts; but basically a list of
         JSON objects, one per journal. KBART files can have multiple rows per
@@ -318,69 +319,13 @@
                 else:
                     new_spans = [[start, end]]
                 d['year_spans'] = merge_spans(old_spans, new_spans)
-        print(counts)
-        return kbart_dict
+        return counts
 
-
-    def index_sim(self, args):
-        path = args.input_file or SIM_FILE
-        print("##### Loading SIM Metadata...")
-        #NA Pub Cat ID,Title,Publisher,ISSN,Impact Rank,Total Cities,Journal Impact Factor,Eigenfact or Score,First Volume,Last Volume,NA Gaps,"Scholarly / Peer-\n Reviewed","Peer-\n Reviewed",Pub Type,Pub Language,Subjects
-        reader = csv.DictReader(open(path))
-        counts = Counter()
-        cur = self.db.cursor()
-        for row in reader:
-            if not row['ISSN'] or row['ISSN'] == "NULL":
-                counts['no-issn'] += 1
-                continue
-            issnl, status = self.add_issn(
-                'ia_sim',
-                raw_issn=row['ISSN'][:9],
-                name=row['Title'],
-                publisher=row['Publisher'],
-                extra=extra,
-            )
-            counts[status] += 1
-            if not issnl:
-                continue
-            d = self.data[issnl]
-            sim = dict()
-            sim['id'] = row['NA Pub Cat ID']
-            first_year = row['First Volume']
-            if first_year:
-                first_year = int(first_year)
-                sim['first_year'] = int(row['First Volume'])
-            else:
-                first_year = None
-            last_year = row['Last Volume']
-            if last_year:
-                last_year = int(last_year)
-                sim['last_year'] = last_year
-            else:
-                last_year = None
-            gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()]
-            if gaps:
-                sim['gaps'] = gaps
-            if first_year and last_year:
-                sim['year_spans'] = gaps_to_spans(first_year, last_year, gaps)
-            if row['Pub Language']:
-                self.add_lang(issnl, row['Pub Language'])
-            # TODO: 'Pub Type'
-            all_keys = list(sim.keys())
-            for k in all_keys:
-                if not sim[k]:
-                    sim.pop(k)
-            self.data[issnl]['sim'] = sim
-        cur.close()
-        self.db.commit()
-        print(counts)
-
-    def update_url_status(self, args):
-        path = args.input_file or IA_CRAWL_FILE
+    def load_homepage_status(self, config: ChoculaConfig) -> Counter:
         print("##### Loading IA Homepage Crawl Results...")
         counts = Counter()
         cur = self.db.cursor()
-        for row in open(path, 'r'):
+        for row in open(config.homepage_status.filepath, 'r'):
             if not row.strip():
                 continue
             row = json.loads(row)
@@ -405,13 +350,12 @@
             counts['updated'] += 1
         cur.close()
         self.db.commit()
-        print(counts)
+        return counts
 
-    def load_fatcat(self, args):
-        path = args.input_file or FATCAT_CONTAINER_FILE
+    def load_fatcat_containers(self, config: ChoculaConfig) -> Counter:
         print("##### Loading Fatcat Container Entities...")
         # JSON
-        json_file = open(path, 'r')
+        json_file = open(config.fatcat_containers.filepath, 'r')
         counts = Counter()
         cur = self.db.cursor()
         for row in json_file:
@@ -445,22 +389,25 @@
                 ))
             except sqlite3.IntegrityError as ie:
                 if str(ie).startswith("UNIQUE"):
-                    return None, "duplicate-issnl"
-                raise ie
+                    counts["existing"] += 1
+                    continue
+                else:
+                    raise ie
             counts['inserted'] += 1
             if row.get('issnl'):
                 urls = extra.get('urls', [])
                 for url in urls:
-                    self.add_url(row['issnl'], url)
+                    homepage = HomepageUrl.from_url(url)
+                    if homepage:
+                        self.insert_homepage(row.get('issnl'), homepage, cur)
         cur.close()
         self.db.commit()
-        print(counts)
+        return counts
 
-    def load_fatcat_stats(self, args):
-        path = args.input_file or FATCAT_STATS_FILE
+    def load_fatcat_stats(self, config: ChoculaConfig) -> Counter:
         print("##### Loading Fatcat Container Stats...")
         # JSON
-        json_file = open(path, 'r')
+        json_file = open(config.fatcat_stats.filepath, 'r')
         counts = Counter()
         cur = self.db.cursor()
         for row in json_file:
@@ -479,18 +426,21 @@
             counts['updated'] += 1
         cur.close()
         self.db.commit()
-        print(counts)
+        return counts
 
-    def export_urls(self, args):
+    def export_urls(self) -> Counter:
+        counts = Counter()
         cur = self.db.cursor()
         self.db.row_factory = sqlite3.Row
         cur = self.db.execute("SELECT issnl, url FROM homepage;")
         for hrow in cur:
             assert(hrow['url'])
             assert(len(hrow['url'].split()) == 1)
+            counts['total'] += 1
             print('\t'.join((hrow['issnl'], hrow['url'])))
+        return counts
 
-    def summarize(self, args):
+    def summarize(self) -> Counter:
         print("##### Summarizing Everything...")
         counts = Counter()
         cur = self.db.cursor()
@@ -506,7 +456,7 @@
             out = dict()
 
             # check if ISSN-L is good. this is here because of fatcat import
-            out['known_issnl'] = (self.issn2issnl(issnl) == issnl)
+            out['known_issnl'] = (self.issn_db.issn2issnl(issnl) == issnl)
             if not out['known_issnl']:
                 counts['unknown-issnl'] += 1
             out['valid_issnl'] = stdnum.issn.is_valid(issnl)
@@ -544,8 +494,8 @@
                     out['is_oa'] = True
                 if irow['slug'] == 'sherpa_romeo':
                     extra = json.loads(irow['extra'])
-                    out['sherpa_color'] = extra['color']
-                    if extra['color'] == 'green':
+                    out['sherpa_color'] = extra['sherpa_romeo']['color']
+                    if extra['sherpa_romeo']['color'] == 'green':
                         out['is_oa'] = True
 
             # filter out "NA" ISSNs
@@ -624,33 +574,9 @@
             ))
         cur.close()
         self.db.commit()
-        print(counts)
-
-    def everything(self, args):
-        self.init_db(args)
-        self.index_doaj(args)
-        self.index_norwegian(args)
-        self.index_crossref(args)
-        self.index_sherpa_romeo(args)
-        self.index_road(args)
-        self.index_entrez(args)
-        self.index_ezb(args)
-        self.index_szczepanski(args)
-        self.index_gold_oa(args)
-        self.index_openapc(args)
-        self.index_wikidata(args)
-        self.load_fatcat(args)
-        self.load_fatcat_stats(args)
-        #self.preserve_kbart('lockss', LOCKSS_FILE)
-        #self.preserve_kbart('clockss', CLOCKSS_FILE)
-        #self.preserve_kbart('portico', PORTICO_FILE)
-        #self.preserve_kbart('jstor', JSTOR_FILE)
-        #self.preserve_sim(args)
-        self.update_url_status(args)
-        self.summarize(args)
-        print("### Done with everything!")
-
-    def export(self, args):
+        return counts
+
+    def export(self) -> Counter:
         def dict_factory(cursor, row):
             d = {}
             for idx, col in enumerate(cursor.description):
@@ -662,8 +588,9 @@
         for row in cur.execute('SELECT * FROM journal'):
             print(json.dumps(row))
             counts['total'] += 1
+        return counts
 
-    def export_fatcat(self, args):
+    def export_fatcat(self):
         counts = Counter()
         self.db.row_factory = sqlite3.Row
         cur = self.db.cursor()
@@ -748,13 +675,14 @@
                 ezb = json.loads(drow['extra'])
                 extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color'])
             if drow['slug'] == 'szczepanski':
-                # XXX: pull from record
-                extra['szczepanski'] = dict(as_of=config.szczepanski.date)
+                # TODO: what to put here?
+                extra['szczepanski'] = drow['extra']
             if drow['slug'] == 'doaj':
                 extra['doaj'] = json.loads(drow['extra'])
 
             out['extra'] = extra
             print(json.dumps(out))
+        return counts
 
     def init_db(self):
         print("### Creating Database...", file=sys.stderr)
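Both parse_kbart() above and the SIM loader below boil per-volume holdings down to inclusive [start, end] year spans, via merge_spans() and gaps_to_spans() from chocula.util (not shown in this patch). The standalone sketch below illustrates the merge semantics the code above relies on; it is an assumption-laden illustration, not the actual chocula.util implementation.

    # Illustration of year-span merging; chocula.util's merge_spans() may differ.
    from typing import List

    def merge_year_spans(spans: List[List[int]]) -> List[List[int]]:
        """Merge overlapping or directly adjacent [start, end] year spans."""
        merged: List[List[int]] = []
        for start, end in sorted(spans):
            if merged and start <= merged[-1][1] + 1:
                # extends or overlaps the previous span; fold it in
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])
        return merged

    # [1950, 1960] and [1961, 1970] are adjacent, so they collapse into one span
    assert merge_year_spans([[1950, 1960], [1961, 1970], [1990, 1995]]) == [
        [1950, 1970],
        [1990, 1995],
    ]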
diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
new file mode 100644
index 0000000..4bed696
--- /dev/null
+++ b/chocula/directories/__init__.py
@@ -0,0 +1,19 @@
+
+from chocula.directories.crossref import CrossrefLoader
+from chocula.directories.doaj import DoajLoader
+from chocula.directories.entrez import EntrezLoader
+from chocula.directories.ezb import EzbLoader
+from chocula.directories.gold_oa import GoldOALoader
+from chocula.directories.norwegian import NorwegianLoader
+from chocula.directories.openapc import OpenAPCLoader
+from chocula.directories.road import RoadLoader
+from chocula.directories.sherpa_romeo import SherpaRomeoLoader
+from chocula.directories.sim import SimLoader
+from chocula.directories.szczepanski import SzczepanskiLoader
+from chocula.directories.wikidata import WikidataLoader
+
+ALL_CHOCULA_DIR_CLASSES = [
+    CrossrefLoader, DoajLoader, EntrezLoader, EzbLoader, GoldOALoader,
+    NorwegianLoader, OpenAPCLoader, RoadLoader, SherpaRomeoLoader,
+    SzczepanskiLoader, WikidataLoader, SimLoader,
+]
diff --git a/chocula/directories/sim.py b/chocula/directories/sim.py
new file mode 100644
index 0000000..c0c02df
--- /dev/null
+++ b/chocula/directories/sim.py
@@ -0,0 +1,71 @@
+
+from typing import Iterable, Optional, Dict, Any
+import csv
+
+from chocula.util import clean_str, parse_mimetypes, parse_country, parse_lang, PLATFORM_MAP, gaps_to_spans
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class SimLoader(DirectoryLoader):
+
+    source_slug = "sim"
+
+    def open_file(self) -> Iterable:
+        return csv.DictReader(open(self.config.sim.filepath))
+
+    def parse_record(self, row) -> Optional[DirectoryInfo]:
+
+        """
+        NA Pub Cat ID
+        Title
+        Publisher
+        ISSN
+        Impact Rank
+        Total Cities
+        Journal Impact Factor
+        Eigenfact or Score
+        First Volume
+        Last Volume
+        NA Gaps
+        "Scholarly / Peer-\n Reviewed"
+        "Peer-\n Reviewed"
+        Pub Type
+        Pub Language
+        Subjects
+        """
+        # TODO: 'Pub Type'
+
+        extra = {}
+        first_year = row['First Volume']
+        if first_year:
+            first_year = int(first_year)
+            extra['first_year'] = int(row['First Volume'])
+        else:
+            first_year = None
+        last_year = row['Last Volume']
+        if last_year:
+            last_year = int(last_year)
+            extra['last_year'] = last_year
+        else:
+            last_year = None
+        gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()]
+        if gaps:
+            extra['gaps'] = gaps
+        if first_year and last_year:
+            extra['year_spans'] = gaps_to_spans(first_year, last_year, gaps)
+        extra['scholarly_peer_reviewed'] = row["Scholarly / Peer-\nReviewed"]
+        extra['peer_reviewed'] = row["Peer-\nReviewed"]
+        extra['pub_type'] = clean_str(row["Pub Type"])
+
+        info = DirectoryInfo(
+            directory_slug=self.source_slug,
+            name=clean_str(row['Title']),
+            publisher=clean_str(row['Publisher']),
+            raw_issn=row['ISSN'][:9],
+            custom_id=row.get('NA Pub Cat ID').strip() or None,
+            langs=[parse_lang(row['Pub Language'])],
+            extra=extra,
+        )
+        return info
+
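To make SimLoader concrete, here is an invented sample row (every value below is fake, including the ISSN). The year_spans value is a guess at gaps_to_spans() semantics — the gap years 1960 and 1961 splitting the 1950–1995 run — since that helper's implementation is not part of this patch.

    # Hypothetical input row for SimLoader.parse_record(); values are invented.
    row = {
        "NA Pub Cat ID": "p0000",
        "Title": "Example Journal of Examples",
        "Publisher": "Example Press",
        "ISSN": "0000-0000",
        "First Volume": "1950",
        "Last Volume": "1995",
        "NA Gaps": "1960;1961",
        "Scholarly / Peer-\nReviewed": "Yes",
        "Peer-\nReviewed": "Yes",
        "Pub Type": "Scholarly Journals",
        "Pub Language": "English",
    }
    # SimLoader(config).parse_record(row) would return a DirectoryInfo with
    # raw_issn="0000-0000", langs=[parse_lang("English")], and extra roughly:
    #   {"first_year": 1950, "last_year": 1995, "gaps": [1960, 1961],
    #    "year_spans": [[1950, 1959], [1962, 1995]],   # assumed semantics
    #    "scholarly_peer_reviewed": "Yes", "peer_reviewed": "Yes",
    #    "pub_type": "Scholarly Journals"}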
diff --git a/chocula_tool.py b/chocula_tool.py
deleted file mode 100755
index 7dfe80e..0000000
--- a/chocula_tool.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Count Chocula - online serials metadata and stats
-
-    "one, two, three, un-preserved web-native open-access long-tail indie
-    journals, hah, hah, hah!"
-
-    (yeah, I know, this name isn't very good)
-    (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html)
-
-Commands:
-
-    everything
-    init_db
-    summarize
-    export
-    export_fatcat
-
-    index_doaj
-    index_road
-    index_crossref
-    index_entrez
-    index_norwegian
-    index_szczepanski
-    index_ezb
-    index_wikidata
-    index_openapc
-
-    load_fatcat
-    load_fatcat_stats
-
-    export_urls
-    update_url_status
-
-Future commands:
-
-    index_jurn
-    index_datacite
-    preserve_kbart --keeper SLUG
-    preserve_sim
-
-See TODO.md for more work-in-progress
-"""
-
-import sys
-import csv
-import argparse
-
-from chocula import ChoculaDatabase, ChoculaConfig
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    subparsers = parser.add_subparsers()
-
-    parser.add_argument("--db-file",
-        help="run in mode that considers only terminal HTML success",
-        default='chocula.sqlite',
-        type=str)
-    parser.add_argument("--input-file",
-        help="override default input file path",
-        default=None,
-        type=str)
-
-    sub = subparsers.add_parser('everything',
-        help="run all the commands")
-    sub.set_defaults(func='everything')
-
-    sub = subparsers.add_parser('init_db',
-        help="create sqlite3 output file and tables")
-    sub.set_defaults(func='init_db')
-
-    sub = subparsers.add_parser('summarize',
-        help="aggregate metadata from all tables into 'journals' table")
-    sub.set_defaults(func='summarize')
-
-    sub = subparsers.add_parser('export',
-        help="dump JSON output")
-    sub.set_defaults(func='export')
-
-    sub = subparsers.add_parser('export_fatcat',
-        help="dump JSON output in a format that can load into fatcat")
-    sub.set_defaults(func='export_fatcat')
-
-    # TODO: 'jurn'
-    for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb', 'gold_oa', 'wikidata', 'openapc'):
-        sub = subparsers.add_parser('index_{}'.format(ind),
-            help="load metadata from {}".format(ind))
-        sub.set_defaults(func='index_{}'.format(ind))
-
-    sub = subparsers.add_parser('load_fatcat',
-        help="load fatcat container metadata")
-    sub.set_defaults(func='load_fatcat')
-
-    sub = subparsers.add_parser('load_fatcat_stats',
-        help="update container-level stats from JSON file")
-    sub.set_defaults(func='load_fatcat_stats')
-
-    sub = subparsers.add_parser('export_urls',
-        help="dump homepage URLs (eg, to crawl for status)")
-    sub.set_defaults(func='export_urls')
-
-    sub = subparsers.add_parser('update_url_status',
-        help="import homepage URL crawl status")
-    sub.set_defaults(func='update_url_status')
-
-    args = parser.parse_args()
-    if not args.__dict__.get("func"):
-        print("tell me what to do! (try --help)")
-        sys.exit(-1)
-
-    config = ChoculaConfig.from_file()
-    cdb = ChoculaDatabase(args.db_file)
-    if args.func.startswith('index_') or args.func in ('everything','summarize',):
-        cdb.read_issn_map_file(config.issnl.filepath)
-    func = getattr(cdb, args.func)
-    func(args)
-
-if __name__ == '__main__':
-    main()
-
diff --git a/sources.toml b/sources.toml
index e824e39..44ad219 100644
--- a/sources.toml
+++ b/sources.toml
@@ -1,6 +1,6 @@
 
 [snapshot]
-ia_item = "chocula-sources-snapshot-2020-05-08"
+ia_item = "chocula-sources-snapshot-2020-05-29"
 
 [issnl]
 date = "2019-12-20"
@@ -109,7 +109,11 @@ filename = "openapc.csv"
 original_url = "https://github.com/OpenAPC/openapc-de/blob/master/data/apc_de.csv"
 mirror_url = "https://archive.org/download/openapc-dataset"
 
-[fatcat_container]
+[sim]
+date = "2019"
+filename = "sim_master_title_metadata.csv"
+
+[fatcat_containers]
 date = "2019-12-13"
 filename = "container_export.json"
diff --git a/tests/test_directories.py b/tests/test_directories.py
index 37c6109..90856bc 100644
--- a/tests/test_directories.py
+++ b/tests/test_directories.py
@@ -19,9 +19,7 @@ def database(issn_db):
 
 def test_all(config, database):
 
-    for cls in (CrossrefLoader, DoajLoader, EntrezLoader, EzbLoader,
-            GoldOALoader, NorwegianLoader, OpenAPCLoader, RoadLoader,
-            SherpaRomeoLoader, SzczepanskiLoader, WikidataLoader):
+    for cls in ALL_CHOCULA_DIR_CLASSES:
         loader = cls(config)
         counts = loader.index_file(database)
         assert counts['total'] >= 20
-- 
cgit v1.2.3