diff options
-rw-r--r--  chocula/__init__.py |   3
-rw-r--r--  chocula/common.py   | 126
-rw-r--r--  chocula/database.py |  60
-rw-r--r--  chocula/kbart.py    |  43
4 files changed, 169 insertions, 63 deletions
diff --git a/chocula/__init__.py b/chocula/__init__.py index 440e7a5..38e61c8 100644 --- a/chocula/__init__.py +++ b/chocula/__init__.py @@ -1,5 +1,6 @@ from chocula.config import ChoculaConfig -from chocula.directories import * from chocula.database import ChoculaDatabase, IssnDatabase +from chocula.directories import * +from chocula.kbart import * diff --git a/chocula/common.py b/chocula/common.py index f515e6f..3c8761f 100644 --- a/chocula/common.py +++ b/chocula/common.py @@ -1,11 +1,19 @@ import sys -from typing import Iterable, Optional +import csv +import datetime +from typing import Iterable, Optional, Dict, Any, List from collections import Counter +from dataclasses import dataclass, field +import ftfy + +from chocula.util import clean_str, clean_issn, merge_spans from chocula.config import ChoculaConfig -from chocula.database import DirectoryInfo +from chocula.database import DirectoryInfo, IssnDatabase, HomepageUrl + +THIS_YEAR = datetime.date.today().year class DirectoryLoader(): @@ -34,6 +42,22 @@ class DirectoryLoader(): db.db.commit() return counts +@dataclass +class KbartRecord: + issnl: Optional[str] + issne: Optional[str] + issnp: Optional[str] + title: Optional[str] + publisher: Optional[str] + start_year: Optional[int] + end_year: Optional[int] + start_volume: Optional[str] + end_volume: Optional[str] + url: Optional[HomepageUrl] + embargo: Optional[str] + year_spans: List[Any] + + class KbartLoader(): source_slug: str = "GENERIC" @@ -41,22 +65,102 @@ class KbartLoader(): def __init__(self, config: ChoculaConfig): self.config = config - def open_file(self) -> Iterable: + def file_path(self) -> str: + #return self.config.TEMPLATE.filepath) raise NotImplementedError() - def parse_record(self, record) -> Optional[DirectoryInfo]: - raise NotImplementedError() + def open_file(self) -> Iterable: + raw_file = open(self.file_path(), 'rb').read().decode(errors='replace') + fixed_file = ftfy.fix_text(raw_file) + reader = 
csv.DictReader(fixed_file.split('\n'), delimiter='\t') + return reader + + def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]: + + issne: Optional[str] = clean_issn(row['online_identifier']) + issnp: Optional[str] = clean_issn(row['print_identifier']) + issnl: Optional[str] = None + if issne: + issnl = issn_db.issn2issnl(issne) + if issnp and not issnl: + issnl = issn_db.issn2issnl(issnp) + start_year: Optional[int] = None + end_year: Optional[int] = None + if row['date_first_issue_online']: + start_year = int(row['date_first_issue_online'][:4]) + if row['date_last_issue_online']: + end_year = int(row['date_last_issue_online'][:4]) + end_volume = row['num_last_vol_online'] + # hack to handle open-ended preservation + if end_year is None and '(present)' in end_volume: + end_year = THIS_YEAR + record = KbartRecord( + issnl=issnl, + issnp=issnp, + issne=issne, + title=clean_str(row['publication_title']), + publisher=clean_str(row['publisher_name']), + url=HomepageUrl.from_url(row['title_url']), + embargo=row['embargo_info'] or None, + start_year=start_year, + end_year=end_year, + start_volume=row['num_first_vol_online'], + end_volume=row['num_last_vol_online'], + year_spans=[], + ) + return record def index_file(self, db) -> Counter: + """ + Transforms a KBART file into a dict of dicts; but basically a list of + JSON objects, one per journal. KBART files can have multiple rows per + journal (eg, different year spans), which is why this pass is needed. 
+ """ print(f"##### Loading {self.source_slug} KBART...", file=sys.stderr) counts: Counter = Counter() - cur = db.db.cursor() - for record in self.open_file(): + kbart_dict: Dict[str, KbartRecord] = dict() + for row in self.open_file(): counts['total'] += 1 - info = self.parse_record(record) - if info: - status = db.insert_directory(info, cur=cur) - counts[status] += 1 + + record = self.parse_record(row, db.issn_db) + if record is None: + counts['skip-parse'] += 1 + continue + elif not record.issnl: + counts['skip-issnl'] += 1 + continue + elif record.start_year is None or record.end_year is None: + counts['missing-years'] += 1 + counts['parsed'] += 1 + + existing = kbart_dict.get(record.issnl, record) + if record.start_year and record.end_year: + old_spans = existing.year_spans or [] + if not record.start_year <= record.end_year: + new_spans = [[record.end_year, record.start_year]] + else: + new_spans = [[record.start_year, record.end_year]] + record.year_spans = merge_spans(old_spans, new_spans) + kbart_dict[record.issnl] = record + + counts['unique-issnl'] = len(kbart_dict) + cur = db.db.cursor() + for issnl, record in kbart_dict.items(): + info = DirectoryInfo( + directory_slug=self.source_slug, + issnl=record.issnl, + issne=record.issne, + issnp=record.issnp, + name=record.title, + publisher=record.publisher, + homepage_urls=[], + extra=dict(year_spans=record.year_spans), + ) + if record.url: + info.homepage_urls.append(record.url) + status = db.insert_directory(info, cur=cur) + counts[status] += 1 cur.close() db.db.commit() return counts + diff --git a/chocula/database.py b/chocula/database.py index 12ac824..dd54448 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -272,55 +272,6 @@ class ChoculaDatabase(): return "inserted" - def parse_kbart(self, name, path) -> Counter: - """ - Transforms a KBART file into a dict of dicts; but basically a list of - JSON objects, one per journal. 
KBART files can have multiple rows per - journal (eg, different year spans), which is why this pass is needed. - """ - print("##### Parsing KBART file for {}...".format(name)) - #publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name - kbart_dict: Dict[str, Any] = dict() - raw_file = open(path, 'rb').read().decode(errors='replace') - fixed_file = ftfy.fix_text(raw_file) - reader = csv.DictReader(fixed_file.split('\n'), delimiter='\t') - counts: Counter = Counter() - for row in reader: - if not row['print_identifier'] and not row['online_identifier']: - counts['no-issn'] += 1 - continue - issnl, status = self.issn_db.lookup_issnl( - issnp=row['print_identifier'], - issne=row['online_identifier'], - ) - counts[status] += 1 - if not issnl: - continue - - info = dict( - title=row['publication_title'] or None, - publisher=row['publisher_name'] or None, - url=row['title_url'] or None, - embargo_info=row['embargo_info'] or None, - ) - - d = kbart_dict.get(issnl, info) - - old_spans = d.get('year_spans', []) - if row['date_first_issue_online'] and row['date_last_issue_online']: - start = int(row['date_first_issue_online'][:4]) - end = int(row['date_last_issue_online'][:4]) - if not start <= end: - print("{}: {} not before {}! 
er, mangling".format( - issnl, - row['date_first_issue_online'], - row['date_last_issue_online'])) - new_spans = [[end, start]] - else: - new_spans = [[start, end]] - d['year_spans'] = merge_spans(old_spans, new_spans) - return counts - def load_homepage_status(self, config: ChoculaConfig) -> Counter: print("##### Loading IA Homepage Crawl Results...") counts: Counter = Counter() @@ -673,11 +624,18 @@ class ChoculaDatabase(): if drow['slug'] == 'ezb': ezb = json.loads(drow['extra']) extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color']) - if drow['slug'] == 'szczepanski': + elif drow['slug'] == 'szczepanski': # TODO: what to put here? extra['szczepanski'] = drow['extra'] - if drow['slug'] == 'doaj': + elif drow['slug'] == 'doaj': extra['doaj'] = json.loads(drow['extra']) + elif drow['slug'] == 'sim': + extra['ia'] = extra.get('ia', {}) + extra['ia']['sim'] = json.loads(drow['extra']) + extra['ia']['sim']['sim_pubid'] = drow['identifier'] + elif drow['slug'] in ('lockss', 'clockss', 'portico', 'jstor'): + extra['kbart'] = extra.get('kbart', {}) + extra['kbart'][drow['slug']] = json.loads(drow['extra']) out['extra'] = extra print(json.dumps(out)) diff --git a/chocula/kbart.py b/chocula/kbart.py new file mode 100644 index 0000000..6c1f580 --- /dev/null +++ b/chocula/kbart.py @@ -0,0 +1,43 @@ + +from typing import List, Any +from chocula.common import KbartLoader + + +class ClockssKbartLoader(KbartLoader): + + source_slug = "clockss" + + def file_path(self) -> str: + return self.config.clockss.filepath + + +class LockssKbartLoader(KbartLoader): + + source_slug = "lockss" + + def file_path(self) -> str: + return self.config.lockss.filepath + + +class PorticoKbartLoader(KbartLoader): + + source_slug = "portico" + + def file_path(self) -> str: + return self.config.portico.filepath + + +class JstorKbartLoader(KbartLoader): + + source_slug = "jstor" + + def file_path(self) -> str: + return self.config.jstor.filepath + + +ALL_CHOCULA_KBART_CLASSES = [ + 
ClockssKbartLoader, + LockssKbartLoader, + PorticoKbartLoader, + JstorKbartLoader, +]