diff options
-rw-r--r--  chocula/__init__.py |   3
-rw-r--r--  chocula/common.py   | 126
-rw-r--r--  chocula/database.py |  60
-rw-r--r--  chocula/kbart.py    |  43
4 files changed, 169 insertions, 63 deletions
diff --git a/chocula/__init__.py b/chocula/__init__.py index 440e7a5..38e61c8 100644 --- a/chocula/__init__.py +++ b/chocula/__init__.py @@ -1,5 +1,6 @@ from chocula.config import ChoculaConfig -from chocula.directories import * from chocula.database import ChoculaDatabase, IssnDatabase +from chocula.directories import * +from chocula.kbart import * diff --git a/chocula/common.py b/chocula/common.py index f515e6f..3c8761f 100644 --- a/chocula/common.py +++ b/chocula/common.py @@ -1,11 +1,19 @@ import sys -from typing import Iterable, Optional +import csv +import datetime +from typing import Iterable, Optional, Dict, Any, List from collections import Counter +from dataclasses import dataclass, field +import ftfy + +from chocula.util import clean_str, clean_issn, merge_spans from chocula.config import ChoculaConfig -from chocula.database import DirectoryInfo +from chocula.database import DirectoryInfo, IssnDatabase, HomepageUrl + +THIS_YEAR = datetime.date.today().year class DirectoryLoader(): @@ -34,6 +42,22 @@ class DirectoryLoader(): db.db.commit() return counts +@dataclass +class KbartRecord: + issnl: Optional[str] + issne: Optional[str] + issnp: Optional[str] + title: Optional[str] + publisher: Optional[str] + start_year: Optional[int] + end_year: Optional[int] + start_volume: Optional[str] + end_volume: Optional[str] + url: Optional[HomepageUrl] + embargo: Optional[str] + year_spans: List[Any] + + class KbartLoader(): source_slug: str = "GENERIC" @@ -41,22 +65,102 @@ class KbartLoader(): def __init__(self, config: ChoculaConfig): self.config = config - def open_file(self) -> Iterable: + def file_path(self) -> str: + #return self.config.TEMPLATE.filepath) raise NotImplementedError() - def parse_record(self, record) -> Optional[DirectoryInfo]: - raise NotImplementedError() + def open_file(self) -> Iterable: + raw_file = open(self.file_path(), 'rb').read().decode(errors='replace') + fixed_file = ftfy.fix_text(raw_file) + reader = 
csv.DictReader(fixed_file.split('\n'), delimiter='\t') + return reader + + def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]: + + issne: Optional[str] = clean_issn(row['online_identifier']) + issnp: Optional[str] = clean_issn(row['print_identifier']) + issnl: Optional[str] = None + if issne: + issnl = issn_db.issn2issnl(issne) + if issnp and not issnl: + issnl = issn_db.issn2issnl(issnp) + start_year: Optional[int] = None + end_year: Optional[int] = None + if row['date_first_issue_online']: + start_year = int(row['date_first_issue_online'][:4]) + if row['date_last_issue_online']: + end_year = int(row['date_last_issue_online'][:4]) + end_volume = row['num_last_vol_online'] + # hack to handle open-ended preservation + if end_year is None and '(present)' in end_volume: + end_year = THIS_YEAR + record = KbartRecord( + issnl=issnl, + issnp=issnp, + issne=issne, + title=clean_str(row['publication_title']), + publisher=clean_str(row['publisher_name']), + url=HomepageUrl.from_url(row['title_url']), + embargo=row['embargo_info'] or None, + start_year=start_year, + end_year=end_year, + start_volume=row['num_first_vol_online'], + end_volume=row['num_last_vol_online'], + year_spans=[], + ) + return record def index_file(self, db) -> Counter: + """ + Transforms a KBART file into a dict of dicts; but basically a list of + JSON objects, one per journal. KBART files can have multiple rows per + journal (eg, different year spans), which is why this pass is needed. 
+ """ print(f"##### Loading {self.source_slug} KBART...", file=sys.stderr) counts: Counter = Counter() - cur = db.db.cursor() - for record in self.open_file(): + kbart_dict: Dict[str, KbartRecord] = dict() + for row in self.open_file(): counts['total'] += 1 - info = self.parse_record(record) - if info: - status = db.insert_directory(info, cur=cur) - counts[status] += 1 + + record = self.parse_record(row, db.issn_db) + if record is None: + counts['skip-parse'] += 1 + continue + elif not record.issnl: + counts['skip-issnl'] += 1 + continue + elif record.start_year is None or record.end_year is None: + counts['missing-years'] += 1 + counts['parsed'] += 1 + + existing = kbart_dict.get(record.issnl, record) + if record.start_year and record.end_year: + old_spans = existing.year_spans or [] + if not record.start_year <= record.end_year: + new_spans = [[record.end_year, record.start_year]] + else: + new_spans = [[record.start_year, record.end_year]] + record.year_spans = merge_spans(old_spans, new_spans) + kbart_dict[record.issnl] = record + + counts['unique-issnl'] = len(kbart_dict) + cur = db.db.cursor() + for issnl, record in kbart_dict.items(): + info = DirectoryInfo( + directory_slug=self.source_slug, + issnl=record.issnl, + issne=record.issne, + issnp=record.issnp, + name=record.title, + publisher=record.publisher, + homepage_urls=[], + extra=dict(year_spans=record.year_spans), + ) + if record.url: + info.homepage_urls.append(record.url) + status = db.insert_directory(info, cur=cur) + counts[status] += 1 cur.close() db.db.commit() return counts + diff --git a/chocula/database.py b/chocula/database.py index 12ac824..dd54448 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -272,55 +272,6 @@ class ChoculaDatabase(): return "inserted" - def parse_kbart(self, name, path) -> Counter: - """ - Transforms a KBART file into a dict of dicts; but basically a list of - JSON objects, one per journal. 
KBART files can have multiple rows per - journal (eg, different year spans), which is why this pass is needed. - """ - print("##### Parsing KBART file for {}...".format(name)) - #publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name - kbart_dict: Dict[str, Any] = dict() - raw_file = open(path, 'rb').read().decode(errors='replace') - fixed_file = ftfy.fix_text(raw_file) - reader = csv.DictReader(fixed_file.split('\n'), delimiter='\t') - counts: Counter = Counter() - for row in reader: - if not row['print_identifier'] and not row['online_identifier']: - counts['no-issn'] += 1 - continue - issnl, status = self.issn_db.lookup_issnl( - issnp=row['print_identifier'], - issne=row['online_identifier'], - ) - counts[status] += 1 - if not issnl: - continue - - info = dict( - title=row['publication_title'] or None, - publisher=row['publisher_name'] or None, - url=row['title_url'] or None, - embargo_info=row['embargo_info'] or None, - ) - - d = kbart_dict.get(issnl, info) - - old_spans = d.get('year_spans', []) - if row['date_first_issue_online'] and row['date_last_issue_online']: - start = int(row['date_first_issue_online'][:4]) - end = int(row['date_last_issue_online'][:4]) - if not start <= end: - print("{}: {} not before {}! 
er, mangling".format( - issnl, - row['date_first_issue_online'], - row['date_last_issue_online'])) - new_spans = [[end, start]] - else: - new_spans = [[start, end]] - d['year_spans'] = merge_spans(old_spans, new_spans) - return counts - def load_homepage_status(self, config: ChoculaConfig) -> Counter: print("##### Loading IA Homepage Crawl Results...") counts: Counter = Counter() @@ -673,11 +624,18 @@ class ChoculaDatabase(): if drow['slug'] == 'ezb': ezb = json.loads(drow['extra']) extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color']) - if drow['slug'] == 'szczepanski': + elif drow['slug'] == 'szczepanski': # TODO: what to put here? extra['szczepanski'] = drow['extra'] - if drow['slug'] == 'doaj': + elif drow['slug'] == 'doaj': extra['doaj'] = json.loads(drow['extra']) + elif drow['slug'] == 'sim': + extra['ia'] = extra.get('ia', {}) + extra['ia']['sim'] = json.loads(drow['extra']) + extra['ia']['sim']['sim_pubid'] = drow['identifier'] + elif drow['slug'] in ('lockss', 'clockss', 'portico', 'jstor'): + extra['kbart'] = extra.get('kbart', {}) + extra['kbart'][drow['slug']] = json.loads(drow['extra']) out['extra'] = extra print(json.dumps(out)) diff --git a/chocula/kbart.py b/chocula/kbart.py new file mode 100644 index 0000000..6c1f580 --- /dev/null +++ b/chocula/kbart.py @@ -0,0 +1,43 @@ + +from typing import List, Any +from chocula.common import KbartLoader + + +class ClockssKbartLoader(KbartLoader): + + source_slug = "clockss" + + def file_path(self) -> str: + return self.config.clockss.filepath + + +class LockssKbartLoader(KbartLoader): + + source_slug = "lockss" + + def file_path(self) -> str: + return self.config.lockss.filepath + + +class PorticoKbartLoader(KbartLoader): + + source_slug = "portico" + + def file_path(self) -> str: + return self.config.portico.filepath + + +class JstorKbartLoader(KbartLoader): + + source_slug = "jstor" + + def file_path(self) -> str: + return self.config.jstor.filepath + + +ALL_CHOCULA_KBART_CLASSES = [ + 
ClockssKbartLoader, + LockssKbartLoader, + PorticoKbartLoader, + JstorKbartLoader, +]