diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-02 10:37:39 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-02 10:37:39 -0700 |
commit | 6bd5d499352ad195a5d91a8344ed57394708ea0a (patch) | |
tree | ab81a7d8be7e36247b955905178a844b8e48c323 /chocula | |
parent | efcbbea0bcd2b2c418652e9e06042ebd247d4ad0 (diff) | |
download | chocula-6bd5d499352ad195a5d91a8344ed57394708ea0a.tar.gz chocula-6bd5d499352ad195a5d91a8344ed57394708ea0a.zip |
hathitrust KBART-style importer
Diffstat (limited to 'chocula')
-rw-r--r-- | chocula/__main__.py | 1 | ||||
-rw-r--r-- | chocula/common.py | 87 | ||||
-rw-r--r-- | chocula/database.py | 9 | ||||
-rw-r--r-- | chocula/kbart.py | 11 |
4 files changed, 106 insertions, 2 deletions
diff --git a/chocula/__main__.py b/chocula/__main__.py index ebd4120..3856f1b 100644 --- a/chocula/__main__.py +++ b/chocula/__main__.py @@ -41,6 +41,7 @@ Commands: lockss portico pkp_pln + hathitrust See TODO.md for more work-in-progress """ diff --git a/chocula/common.py b/chocula/common.py index 763997b..edd48a3 100644 --- a/chocula/common.py +++ b/chocula/common.py @@ -216,3 +216,90 @@ class OnixCsvLoader(KbartLoader): year_spans=[], ) return record + + +class HathifilesLoader(KbartLoader): + """ + Similar to the KBART loader class, but for Hathifiles bulk format. + + Relavent TSV columns ("one-indexed", not zero-indexed): + + - 2 access (allow=bright, deny=dark) + - 5 description + - 10 issn ("multiple values separated by comma") + - 12 title (if translated, separated by equals or slash) + - 13 imprint (publisher and year; often "publisher, year") + - 17 rights_date_used (year; 9999=unknown) + - 19 lang (MARC format) + """ + + def open_file(self) -> Iterable: + return csv.DictReader( + open(self.file_path(), "r"), + delimiter="\t", + fieldnames=[ + "htid", + "access", + "rights", + "ht_bib_key", + "description", + "source", + "source_bib_num", + "oclc_num", + "isbn", + "issn", + "lccn", + "title", + "imprint", + "rights_reason_code", + "rights_timestamp", + "us_gov_doc_flag", + "rights_date_used", + "pub_place", + "lang", + "bib_fmt", + "collection_code", + "content_provider_code", + "responsible_entity_code", + "digitization_agent_code", + "access_profile_code", + "author", + ], + ) + + def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]: + + # unpack fields + # access = dict(allow="bright", deny="dark")[row['access']] + raw_issn = clean_issn(row["issn"].split(",")[0]) + imprint = clean_str(row["imprint"]) + raw_date = row["rights_date_used"].strip() + + issnl = issn_db.issn2issnl(raw_issn or "") + + rights_date: Optional[int] = None + if raw_date.isdigit(): + rights_date = int(raw_date) + start_year: Optional[int] = rights_date + if start_year == 9999: + start_year = None + + publisher: Optional[str] = None + if imprint: + publisher = imprint.split(".")[0].split(",")[0].split("[")[0].strip() + + record = KbartRecord( + issnl=issnl, + issne=None, + issnp=None, + embargo=None, + title=clean_str(row["title"]), + publisher=publisher, + url=None, + start_year=start_year, + end_year=start_year, + start_volume=None, + end_volume=None, + year_spans=[], + ) + return record diff --git a/chocula/database.py b/chocula/database.py index f70f4a6..c432b76 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -829,7 +829,14 @@ class ChoculaDatabase: ): if dextra.get(k) is not None: extra["ia"]["sim"][k] = dextra[k] - elif drow["slug"] in ("lockss", "clockss", "portico", "jstor", "pkp_pln"): + elif drow["slug"] in ( + "lockss", + "clockss", + "portico", + "jstor", + "pkp_pln", + "hathitrust", + ): extra["kbart"] = extra.get("kbart", {}) extra["kbart"][drow["slug"]] = dict(year_spans=dextra["year_spans"]) if dextra.get("abbrev"): diff --git a/chocula/kbart.py b/chocula/kbart.py index 5fd0acc..3944430 100644 --- a/chocula/kbart.py +++ b/chocula/kbart.py @@ -1,4 +1,4 @@ -from chocula.common import KbartLoader, OnixCsvLoader +from chocula.common import KbartLoader, OnixCsvLoader, HathifilesLoader class ClockssKbartLoader(KbartLoader): @@ -41,10 +41,19 @@ class PkpPlnOnixLoader(OnixCsvLoader): return self.config.pkp_pln.filepath +class HathitrustLoader(HathifilesLoader): + + source_slug = "hathitrust" + + def file_path(self) -> str: + return self.config.hathitrust.filepath + + ALL_CHOCULA_KBART_CLASSES = [ ClockssKbartLoader, LockssKbartLoader, PorticoKbartLoader, JstorKbartLoader, PkpPlnOnixLoader, + HathitrustLoader, ] |