aboutsummaryrefslogtreecommitdiffstats
path: root/chocula
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-09-02 10:37:39 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-09-02 10:37:39 -0700
commit 6bd5d499352ad195a5d91a8344ed57394708ea0a (patch)
tree ab81a7d8be7e36247b955905178a844b8e48c323 /chocula
parent efcbbea0bcd2b2c418652e9e06042ebd247d4ad0 (diff)
download chocula-6bd5d499352ad195a5d91a8344ed57394708ea0a.tar.gz
chocula-6bd5d499352ad195a5d91a8344ed57394708ea0a.zip
hathitrust KBART-style importer
Diffstat (limited to 'chocula')
-rw-r--r-- chocula/__main__.py 1
-rw-r--r-- chocula/common.py 87
-rw-r--r-- chocula/database.py 9
-rw-r--r-- chocula/kbart.py 11
4 files changed, 106 insertions, 2 deletions
diff --git a/chocula/__main__.py b/chocula/__main__.py
index ebd4120..3856f1b 100644
--- a/chocula/__main__.py
+++ b/chocula/__main__.py
@@ -41,6 +41,7 @@ Commands:
lockss
portico
pkp_pln
+ hathitrust
See TODO.md for more work-in-progress
"""
diff --git a/chocula/common.py b/chocula/common.py
index 763997b..edd48a3 100644
--- a/chocula/common.py
+++ b/chocula/common.py
@@ -216,3 +216,90 @@ class OnixCsvLoader(KbartLoader):
year_spans=[],
)
return record
+
+
+class HathifilesLoader(KbartLoader):
+ """
+ Similar to the KBART loader class, but for Hathifiles bulk format.
+
+ Relevant TSV columns ("one-indexed", not zero-indexed):
+
+ - 2 access (allow=bright, deny=dark)
+ - 5 description
+ - 10 issn ("multiple values separated by comma")
+ - 12 title (if translated, separated by equals or slash)
+ - 13 imprint (publisher and year; often "publisher, year")
+ - 17 rights_date_used (year; 9999=unknown)
+ - 19 lang (MARC format)
+ """
+
+ def open_file(self) -> Iterable:
+ return csv.DictReader(
+ open(self.file_path(), "r"),
+ delimiter="\t",
+ fieldnames=[
+ "htid",
+ "access",
+ "rights",
+ "ht_bib_key",
+ "description",
+ "source",
+ "source_bib_num",
+ "oclc_num",
+ "isbn",
+ "issn",
+ "lccn",
+ "title",
+ "imprint",
+ "rights_reason_code",
+ "rights_timestamp",
+ "us_gov_doc_flag",
+ "rights_date_used",
+ "pub_place",
+ "lang",
+ "bib_fmt",
+ "collection_code",
+ "content_provider_code",
+ "responsible_entity_code",
+ "digitization_agent_code",
+ "access_profile_code",
+ "author",
+ ],
+ )
+
+ def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]:
+
+ # unpack fields
+ # access = dict(allow="bright", deny="dark")[row['access']]
+ raw_issn = clean_issn(row["issn"].split(",")[0])
+ imprint = clean_str(row["imprint"])
+ raw_date = row["rights_date_used"].strip()
+
+ issnl = issn_db.issn2issnl(raw_issn or "")
+
+ rights_date: Optional[int] = None
+ if raw_date.isdigit():
+ rights_date = int(raw_date)
+ start_year: Optional[int] = rights_date
+ if start_year == 9999:
+ start_year = None
+
+ publisher: Optional[str] = None
+ if imprint:
+ publisher = imprint.split(".")[0].split(",")[0].split("[")[0].strip()
+
+ record = KbartRecord(
+ issnl=issnl,
+ issne=None,
+ issnp=None,
+ embargo=None,
+ title=clean_str(row["title"]),
+ publisher=publisher,
+ url=None,
+ start_year=start_year,
+ end_year=start_year,
+ start_volume=None,
+ end_volume=None,
+ year_spans=[],
+ )
+ return record
diff --git a/chocula/database.py b/chocula/database.py
index f70f4a6..c432b76 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -829,7 +829,14 @@ class ChoculaDatabase:
):
if dextra.get(k) is not None:
extra["ia"]["sim"][k] = dextra[k]
- elif drow["slug"] in ("lockss", "clockss", "portico", "jstor", "pkp_pln"):
+ elif drow["slug"] in (
+ "lockss",
+ "clockss",
+ "portico",
+ "jstor",
+ "pkp_pln",
+ "hathitrust",
+ ):
extra["kbart"] = extra.get("kbart", {})
extra["kbart"][drow["slug"]] = dict(year_spans=dextra["year_spans"])
if dextra.get("abbrev"):
diff --git a/chocula/kbart.py b/chocula/kbart.py
index 5fd0acc..3944430 100644
--- a/chocula/kbart.py
+++ b/chocula/kbart.py
@@ -1,4 +1,4 @@
-from chocula.common import KbartLoader, OnixCsvLoader
+from chocula.common import KbartLoader, OnixCsvLoader, HathifilesLoader
class ClockssKbartLoader(KbartLoader):
@@ -41,10 +41,19 @@ class PkpPlnOnixLoader(OnixCsvLoader):
return self.config.pkp_pln.filepath
+class HathitrustLoader(HathifilesLoader):
+
+ source_slug = "hathitrust"
+
+ def file_path(self) -> str:
+ return self.config.hathitrust.filepath
+
+
ALL_CHOCULA_KBART_CLASSES = [
ClockssKbartLoader,
LockssKbartLoader,
PorticoKbartLoader,
JstorKbartLoader,
PkpPlnOnixLoader,
+ HathitrustLoader,
]