about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-09-02 10:37:39 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-09-02 10:37:39 -0700
commit 6bd5d499352ad195a5d91a8344ed57394708ea0a (patch)
tree ab81a7d8be7e36247b955905178a844b8e48c323
parent efcbbea0bcd2b2c418652e9e06042ebd247d4ad0 (diff)
download chocula-6bd5d499352ad195a5d91a8344ed57394708ea0a.tar.gz
chocula-6bd5d499352ad195a5d91a8344ed57394708ea0a.zip
hathitrust KBART-style importer
-rw-r--r-- chocula/__main__.py 1
-rw-r--r-- chocula/common.py 87
-rw-r--r-- chocula/database.py 9
-rw-r--r-- chocula/kbart.py 11
-rw-r--r-- sources.toml 6
-rw-r--r-- tests/files/ISSN-to-ISSN-L.txt 10
-rw-r--r-- tests/files/hathi_serials.tsv 30
7 files changed, 152 insertions, 2 deletions
diff --git a/chocula/__main__.py b/chocula/__main__.py
index ebd4120..3856f1b 100644
--- a/chocula/__main__.py
+++ b/chocula/__main__.py
@@ -41,6 +41,7 @@ Commands:
lockss
portico
pkp_pln
+ hathitrust
See TODO.md for more work-in-progress
"""
diff --git a/chocula/common.py b/chocula/common.py
index 763997b..edd48a3 100644
--- a/chocula/common.py
+++ b/chocula/common.py
@@ -216,3 +216,90 @@ class OnixCsvLoader(KbartLoader):
year_spans=[],
)
return record
+
+
+class HathifilesLoader(KbartLoader):
+ """
+ Similar to the KBART loader class, but for Hathifiles bulk format.
+
+ Relevant TSV columns ("one-indexed", not zero-indexed):
+
+ - 2 access (allow=bright, deny=dark)
+ - 5 description
+ - 10 issn ("multiple values separated by comma")
+ - 12 title (if translated, separated by equals or slash)
+ - 13 imprint (publisher and year; often "publisher, year")
+ - 17 rights_date_used (year; 9999=unknown)
+ - 19 lang (MARC format)
+ """
+
+ def open_file(self) -> Iterable:
+ return csv.DictReader(
+ open(self.file_path(), "r"),
+ delimiter="\t",
+ fieldnames=[
+ "htid",
+ "access",
+ "rights",
+ "ht_bib_key",
+ "description",
+ "source",
+ "source_bib_num",
+ "oclc_num",
+ "isbn",
+ "issn",
+ "lccn",
+ "title",
+ "imprint",
+ "rights_reason_code",
+ "rights_timestamp",
+ "us_gov_doc_flag",
+ "rights_date_used",
+ "pub_place",
+ "lang",
+ "bib_fmt",
+ "collection_code",
+ "content_provider_code",
+ "responsible_entity_code",
+ "digitization_agent_code",
+ "access_profile_code",
+ "author",
+ ],
+ )
+
+ def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]:
+
+ # unpack fields
+ # access = dict(allow="bright", deny="dark")[row['access']]
+ raw_issn = clean_issn(row["issn"].split(",")[0])
+ imprint = clean_str(row["imprint"])
+ raw_date = row["rights_date_used"].strip()
+
+ issnl = issn_db.issn2issnl(raw_issn or "")
+
+ rights_date: Optional[int] = None
+ if raw_date.isdigit():
+ rights_date = int(raw_date)
+ start_year: Optional[int] = rights_date
+ if start_year == 9999:
+ start_year = None
+
+ publisher: Optional[str] = None
+ if imprint:
+ publisher = imprint.split(".")[0].split(",")[0].split("[")[0].strip()
+
+ record = KbartRecord(
+ issnl=issnl,
+ issne=None,
+ issnp=None,
+ embargo=None,
+ title=clean_str(row["title"]),
+ publisher=publisher,
+ url=None,
+ start_year=start_year,
+ end_year=start_year,
+ start_volume=None,
+ end_volume=None,
+ year_spans=[],
+ )
+ return record
diff --git a/chocula/database.py b/chocula/database.py
index f70f4a6..c432b76 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -829,7 +829,14 @@ class ChoculaDatabase:
):
if dextra.get(k) is not None:
extra["ia"]["sim"][k] = dextra[k]
- elif drow["slug"] in ("lockss", "clockss", "portico", "jstor", "pkp_pln"):
+ elif drow["slug"] in (
+ "lockss",
+ "clockss",
+ "portico",
+ "jstor",
+ "pkp_pln",
+ "hathitrust",
+ ):
extra["kbart"] = extra.get("kbart", {})
extra["kbart"][drow["slug"]] = dict(year_spans=dextra["year_spans"])
if dextra.get("abbrev"):
diff --git a/chocula/kbart.py b/chocula/kbart.py
index 5fd0acc..3944430 100644
--- a/chocula/kbart.py
+++ b/chocula/kbart.py
@@ -1,4 +1,4 @@
-from chocula.common import KbartLoader, OnixCsvLoader
+from chocula.common import KbartLoader, OnixCsvLoader, HathifilesLoader
class ClockssKbartLoader(KbartLoader):
@@ -41,10 +41,19 @@ class PkpPlnOnixLoader(OnixCsvLoader):
return self.config.pkp_pln.filepath
+class HathitrustLoader(HathifilesLoader):
+
+ source_slug = "hathitrust"
+
+ def file_path(self) -> str:
+ return self.config.hathitrust.filepath
+
+
ALL_CHOCULA_KBART_CLASSES = [
ClockssKbartLoader,
LockssKbartLoader,
PorticoKbartLoader,
JstorKbartLoader,
PkpPlnOnixLoader,
+ HathitrustLoader,
]
diff --git a/sources.toml b/sources.toml
index faadb47..c9060f7 100644
--- a/sources.toml
+++ b/sources.toml
@@ -87,6 +87,12 @@ original_url = "https://www.jstor.org/kbart/collections/all-archive-titles?conte
filename = "onix_pkp_pln.csv"
original_url = "http://pkp.sfu.ca/files/pkppn/onix.csv"
+[hathitrust]
+# see notes/hathitrust.md
+date = "2020-08-01"
+filename = "hathi_serials.tsv"
+original_url = "https://www.hathitrust.org/hathifiles"
+
[szczepanski]
date = '2018'
# Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json
diff --git a/tests/files/ISSN-to-ISSN-L.txt b/tests/files/ISSN-to-ISSN-L.txt
index 73f4629..72db455 100644
--- a/tests/files/ISSN-to-ISSN-L.txt
+++ b/tests/files/ISSN-to-ISSN-L.txt
@@ -366,3 +366,13 @@ ISSN ISSN-L
1715-0868 1715-0868
2215-2075 2215-2075
1988-8325 1988-8325
+0010-4523 0010-4523
+0002-161X 0002-161X
+0768-5475 0768-5475
+0970-4728 0970-4728
+0372-9192 0372-9192
+0190-6313 0190-6313
+0892-8266 0892-8266
+0065-8170 0065-8170
+0068-1202 0068-1202
+0042-465x 0042-465x
diff --git a/tests/files/hathi_serials.tsv b/tests/files/hathi_serials.tsv
new file mode 100644
index 0000000..5200062
--- /dev/null
+++ b/tests/files/hathi_serials.tsv
@@ -0,0 +1,30 @@
+uva.x002429058 deny ic 000023255 v.10 1993 UVA u1340113 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2017-05-22 03:25:50 0 1993 hu scr SE UVA virginia virginia google google
+uva.x001495284 deny ic 000023255 v.6-8 1984-87 UVA u1340113 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2017-05-22 03:25:50 0 1987 hu scr SE UVA virginia virginia google google
+uva.x001542254 deny ic 000023255 v.3-5 1979-82 UVA u1340113 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2017-05-22 03:25:50 0 1982 hu scr SE UVA virginia virginia google google
+mdp.39015022896545 deny ic 000023255 v.7-10 1985-1993 MIU 000023255 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2011-04-06 04:30:09 0 1993 hu scr SE MIU umich umich google google
+mdp.39015068919623 deny ic 000023255 v.3-6 1979-1984 MIU 000023255 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2008-08-23 22:30:06 0 1984 hu scr SE MIU umich umich google google
+mdp.39015068919615 deny ic 000023255 v.1-2 1975-1977 MIU 000023255 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2012-07-25 19:30:17 0 1977 hu scr SE MIU umich umich google google
+mdp.39015012888866 deny und 000024752 v.1 MIU 000024752 2428173 1054-7533 70103816 Experimental cinema. Arno Press 1969. bib 2012-06-21 19:30:09 0 1934 nyu eng SE MIU umich umich google google
+wu.89098742208 deny ic 000030973 1943-1944 WU 4351521 1644142 45004730 Best film plays of ... Crown Publishers, 1944- bib 2010-04-25 19:31:25 0 1944 nyu eng SE WU wisc wisc google google
+mdp.39015021085272 deny ic 000045215 v.1 1962 MIU 000045215 2083094 0010-4523 62014777 Computer applications service. American Data Processing] bib 2007-12-20 09:30:04 0 1962 miu eng SE MIU umich umich google google
+mdp.39015021085280 deny und 000045215 v.2-4 MIU 000045215 2083094 0010-4523 62014777 Computer applications service. American Data Processing] bib 2009-06-12 22:30:30 0 9999 miu eng SE MIU umich umich google google
+umn.31951d02948236r allow pd 000599667 v.52:no.11 (2004) UMN 002307673 1478561 0002-161X 0002-161X agr53000137 //r83 Agricultural research / U.S. Department of Agriculture. Science and Education Administration], U.S. Dept. of Agriculture : [Supt. of Docs., U.S. G.P.O., distributor, bib 2013-04-19 05:25:40 1 2004 dcu eng SE UMN umn umn google google
+umn.31951001907305l allow pdus 000056498 fasc.27-28 (1921) UMN 001499770 1755884 0768-5475 Bibliothèque de la Faculté de philosophie et lettres de l'Université de Liège. Les Belles lettres, 1897- bib 2015-01-10 03:25:14 0 1921 be fre SE UMN umn umn google google
+mdp.39015079651769 deny ic 003051832 v.33 no.3-4 1997-1998 MIU 003051832 26905188 0970-4728 92650249 ILA bulletin. Indian Library Association, 1975- bib 2009-09-07 22:30:12 0 1998 ii eng SE MIU umich umich google google
+mdp.39015074028492 deny und 000500090 v.49 miu 000500090 1770485 0372-9192 46018870 zeitschrift für klinische medizin. springer-verlag. bib 2012-11-16 04:30:03 0 1965 gw ger se miu umich umich google google
+coo.31924059002018 deny ic 000636079 78th 1991 coo 1649072 1768174 0190-6313 15026170 annual report / board of governors of the federal reserve system board of governors of the federal reserve system, [1967]- bib 2013-02-20 04:31:57 0 1991 dcu eng se coo cornell cornell google google board of governors of the federal reserve system (u.s.)
+mdp.39015076621310 allow pd 000637343 v.4,7-8,14 1854,1857-1858,1863 miu 000637343 1757026 0892-8266 07041920 transactions of the medical society of the state of pennsylvania at its ... annual session. the society. bib 2013-08-09 20:26:57 0 1863 pau eng se miu umich umich lit-dlps-dc open medical society of the state of pennsylvania.
+chi.72660825 allow pdus 000505286 c.1 no.3-6 chi 290494 1479819 0065-8170 52033049 memoirs of the american entomological society. american entomological society. gfv 2012-04-29 01:02:29 0 9999 pau eng se chi uchicago uchicago google google american entomological society.
+mdp.39015024068416 deny ic 000528005 v.76 1991 miu 000528005 1772818 0068-1202 07036968 //r493 proceedings of the british academy. published for the british academy by g. cumberlege, oxford university press, bib 2012-06-21 19:30:18 0 1991 enk eng se miu umich umich google google
+uc1.b4106780 deny ic 010010552 1961 uc .b17076882x 4128023 0042-465x sn 86012760 vestnik oftalʹmologii. medit︠s︡ina. bib 2011-07-13 20:30:35 0 1961 ru rus se nrlf universityofcalifornia universityofcalifornia google google
+njp.32101077271102 allow pd 009034747 ser.3, v.7 (apr. 1905-jan. 1907) njp 628627 1481386 0003-4827 05032209 //r822 annals of iowa. iowa state historical dept., division of historical museum and archives, bib 2012-10-27 19:30:39 0 1907 iau eng se njp princeton princeton google google
+mdp.39015068664518 deny ic 002866340 v.22-26 2002-2006 miu 002866340 12251337 0971-0388 87909513 the aligarh journal of statistics. dept. of statistics, aligarh muslim university, bib 2010-04-01 20:31:04 0 2006 ii eng se miu umich umich google google
+uc1.$b771057 allow pd 010690843 no.660-666 1950 uc .b157201983 2636094 0041-767x 86655128 united states government publications monthly catalog / issued by the superintendent of documents. u.s. g.p.o. ; for sale by the supt. of docs., 1940-1951. bib 2020-05-29 03:25:03 1 1950 dcu eng se nrlf universityofcalifornia universityofcalifornia google google
+uc1.31822009591405 deny und 000521949 v.18 uc .b27054986 3402040 0021-1311 the irish naturalists' journal. i.n.j. committee. bib 2010-08-12 20:31:43 0 9999 nik eng se ucsd universityofcalifornia universityofcalifornia google google
+pst.000046424143 deny ic 012255615 1st.ed. 2000 pst a2122089 42836372 1527-4837 sn 99009894 fodor's naples and the amalfi coast. fodor's travel publications, incorporated, c2000- bib 2014-11-05 10:28:06 0 2000 xx eng se pst psu psu google google
+uc1.32106007707885 deny und 000067489 v.6:1-5 uc .b15250313 8049660 0730-7004 88654892 american health. american health partners, c1982-. bib 2010-08-21 20:31:35 0 1982 eng se ucsc universityofcalifornia universityofcalifornia google google
+uc1.b3540645 deny ic 000640615 v.7 (1944) uc .b126675284 1566509 0012-1223 40012538 //r52 deutsches archiv für erforschung des mittelalters. böhlau. bib 2012-07-19 04:30:58 0 1944 gw ger se nrlf universityofcalifornia universityofcalifornia google google
+mdp.39015069542184 deny und 000548966 no.301-319 MIU 000548966 20910946 0369-9870 90656505 Prace Instytutu Badawczego Leśnictwa. Państwowe Wydawn. Rolnicze i Leśne, 1958-2000. bib 2010-05-24 22:30:40 0 2000 pl pol SE MIU umich umich google google
+pst.000054405837 allow pd 009445702 1951-1958 PST a108888 1606613 2010229450 Evaporated, condensed, and dry milk report. Dept. of Agriculture. bib 2018-12-16 03:25:14 1 1958 dcu eng SE PST psu psu google google
+mdp.39015013030138 deny ic 000546404 v.9 1981 46257-67357 MIU 000546404 3837820 0162-704X 78645419 Conference papers index. Data Courier, Inc. bib 2013-04-30 19:25:08 0 1981 kyu eng SE MIU umich umich google google
+umn.31951p00389299k deny ic 002543035 no.16-24 1989-93 UMN 001000958 20951089 1050-2351 sn 90014073 Latin American population history bulletin. Dept. of History, University of Minnesota, c1989- bib 2015-02-12 03:26:52 0 1993 mnu eng SE UMN umn umn google google