From 6bd5d499352ad195a5d91a8344ed57394708ea0a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 2 Sep 2020 10:37:39 -0700 Subject: hathitrust KBART-style importer --- chocula/__main__.py | 1 + chocula/common.py | 87 ++++++++++++++++++++++++++++++++++++++++++ chocula/database.py | 9 ++++- chocula/kbart.py | 11 +++++- sources.toml | 6 +++ tests/files/ISSN-to-ISSN-L.txt | 10 +++++ tests/files/hathi_serials.tsv | 30 +++++++++++++++ 7 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 tests/files/hathi_serials.tsv diff --git a/chocula/__main__.py b/chocula/__main__.py index ebd4120..3856f1b 100644 --- a/chocula/__main__.py +++ b/chocula/__main__.py @@ -41,6 +41,7 @@ Commands: lockss portico pkp_pln + hathitrust See TODO.md for more work-in-progress """ diff --git a/chocula/common.py b/chocula/common.py index 763997b..edd48a3 100644 --- a/chocula/common.py +++ b/chocula/common.py @@ -216,3 +216,90 @@ class OnixCsvLoader(KbartLoader): year_spans=[], ) return record + + +class HathifilesLoader(KbartLoader): + """ + Similar to the KBART loader class, but for Hathifiles bulk format. 
+ + Relevant TSV columns ("one-indexed", not zero-indexed): + + - 2 access (allow=bright, deny=dark) + - 5 description + - 10 issn ("multiple values separated by comma") + - 12 title (if translated, separated by equals or slash) + - 13 imprint (publisher and year; often "publisher, year") + - 17 rights_date_used (year; 9999=unknown) + - 19 lang (MARC format) + """ + + def open_file(self) -> Iterable: + return csv.DictReader( + open(self.file_path(), "r"), + delimiter="\t", + fieldnames=[ + "htid", + "access", + "rights", + "ht_bib_key", + "description", + "source", + "source_bib_num", + "oclc_num", + "isbn", + "issn", + "lccn", + "title", + "imprint", + "rights_reason_code", + "rights_timestamp", + "us_gov_doc_flag", + "rights_date_used", + "pub_place", + "lang", + "bib_fmt", + "collection_code", + "content_provider_code", + "responsible_entity_code", + "digitization_agent_code", + "access_profile_code", + "author", + ], + ) + + def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]: + + # unpack fields + # access = dict(allow="bright", deny="dark")[row['access']] + raw_issn = clean_issn(row["issn"].split(",")[0]) + imprint = clean_str(row["imprint"]) + raw_date = row["rights_date_used"].strip() + + issnl = issn_db.issn2issnl(raw_issn or "") + + rights_date: Optional[int] = None + if raw_date.isdigit(): + rights_date = int(raw_date) + start_year: Optional[int] = rights_date + if start_year == 9999: + start_year = None + + publisher: Optional[str] = None + if imprint: + publisher = imprint.split(".")[0].split(",")[0].split("[")[0].strip() + + record = KbartRecord( + issnl=issnl, + issne=None, + issnp=None, + embargo=None, + title=clean_str(row["title"]), + publisher=publisher, + url=None, + start_year=start_year, + end_year=start_year, + start_volume=None, + end_volume=None, + year_spans=[], + ) + return record diff --git a/chocula/database.py b/chocula/database.py index f70f4a6..c432b76 100644 --- a/chocula/database.py +++ 
b/chocula/database.py @@ -829,7 +829,14 @@ class ChoculaDatabase: ): if dextra.get(k) is not None: extra["ia"]["sim"][k] = dextra[k] - elif drow["slug"] in ("lockss", "clockss", "portico", "jstor", "pkp_pln"): + elif drow["slug"] in ( + "lockss", + "clockss", + "portico", + "jstor", + "pkp_pln", + "hathitrust", + ): extra["kbart"] = extra.get("kbart", {}) extra["kbart"][drow["slug"]] = dict(year_spans=dextra["year_spans"]) if dextra.get("abbrev"): diff --git a/chocula/kbart.py b/chocula/kbart.py index 5fd0acc..3944430 100644 --- a/chocula/kbart.py +++ b/chocula/kbart.py @@ -1,4 +1,4 @@ -from chocula.common import KbartLoader, OnixCsvLoader +from chocula.common import KbartLoader, OnixCsvLoader, HathifilesLoader class ClockssKbartLoader(KbartLoader): @@ -41,10 +41,19 @@ class PkpPlnOnixLoader(OnixCsvLoader): return self.config.pkp_pln.filepath +class HathitrustLoader(HathifilesLoader): + + source_slug = "hathitrust" + + def file_path(self) -> str: + return self.config.hathitrust.filepath + + ALL_CHOCULA_KBART_CLASSES = [ ClockssKbartLoader, LockssKbartLoader, PorticoKbartLoader, JstorKbartLoader, PkpPlnOnixLoader, + HathitrustLoader, ] diff --git a/sources.toml b/sources.toml index faadb47..c9060f7 100644 --- a/sources.toml +++ b/sources.toml @@ -87,6 +87,12 @@ original_url = "https://www.jstor.org/kbart/collections/all-archive-titles?conte filename = "onix_pkp_pln.csv" original_url = "http://pkp.sfu.ca/files/pkppn/onix.csv" +[hathitrust] +# see notes/hathitrust.md +date = "2020-08-01" +filename = "hathi_serials.tsv" +original_url = "https://www.hathitrust.org/hathifiles" + [szczepanski] date = '2018' # Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json diff --git a/tests/files/ISSN-to-ISSN-L.txt b/tests/files/ISSN-to-ISSN-L.txt index 73f4629..72db455 100644 --- a/tests/files/ISSN-to-ISSN-L.txt +++ b/tests/files/ISSN-to-ISSN-L.txt @@ -366,3 +366,13 @@ ISSN ISSN-L 1715-0868 1715-0868 2215-2075 2215-2075 1988-8325 1988-8325 +0010-4523 0010-4523 +0002-161X 
0002-161X +0768-5475 0768-5475 +0970-4728 0970-4728 +0372-9192 0372-9192 +0190-6313 0190-6313 +0892-8266 0892-8266 +0065-8170 0065-8170 +0068-1202 0068-1202 +0042-465x 0042-465x diff --git a/tests/files/hathi_serials.tsv b/tests/files/hathi_serials.tsv new file mode 100644 index 0000000..5200062 --- /dev/null +++ b/tests/files/hathi_serials.tsv @@ -0,0 +1,30 @@ +uva.x002429058 deny ic 000023255 v.10 1993 UVA u1340113 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2017-05-22 03:25:50 0 1993 hu scr SE UVA virginia virginia google google +uva.x001495284 deny ic 000023255 v.6-8 1984-87 UVA u1340113 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2017-05-22 03:25:50 0 1987 hu scr SE UVA virginia virginia google google +uva.x001542254 deny ic 000023255 v.3-5 1979-82 UVA u1340113 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2017-05-22 03:25:50 0 1982 hu scr SE UVA virginia virginia google google +mdp.39015022896545 deny ic 000023255 v.7-10 1985-1993 MIU 000023255 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2011-04-06 04:30:09 0 1993 hu scr SE MIU umich umich google google +mdp.39015068919623 deny ic 000023255 v.3-6 1979-1984 MIU 000023255 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. Tankönyvkiadó, 1975- bib 2008-08-23 22:30:06 0 1984 hu scr SE MIU umich umich google google +mdp.39015068919615 deny ic 000023255 v.1-2 1975-1977 MIU 000023255 7847401 0134-045X 88648893 Etnografija južnih Slavena u Mađarskoj = Etnografija južnih Slovena u Mađarskoj. 
Tankönyvkiadó, 1975- bib 2012-07-25 19:30:17 0 1977 hu scr SE MIU umich umich google google +mdp.39015012888866 deny und 000024752 v.1 MIU 000024752 2428173 1054-7533 70103816 Experimental cinema. Arno Press 1969. bib 2012-06-21 19:30:09 0 1934 nyu eng SE MIU umich umich google google +wu.89098742208 deny ic 000030973 1943-1944 WU 4351521 1644142 45004730 Best film plays of ... Crown Publishers, 1944- bib 2010-04-25 19:31:25 0 1944 nyu eng SE WU wisc wisc google google +mdp.39015021085272 deny ic 000045215 v.1 1962 MIU 000045215 2083094 0010-4523 62014777 Computer applications service. American Data Processing] bib 2007-12-20 09:30:04 0 1962 miu eng SE MIU umich umich google google +mdp.39015021085280 deny und 000045215 v.2-4 MIU 000045215 2083094 0010-4523 62014777 Computer applications service. American Data Processing] bib 2009-06-12 22:30:30 0 9999 miu eng SE MIU umich umich google google +umn.31951d02948236r allow pd 000599667 v.52:no.11 (2004) UMN 002307673 1478561 0002-161X 0002-161X agr53000137 //r83 Agricultural research / U.S. Department of Agriculture. Science and Education Administration], U.S. Dept. of Agriculture : [Supt. of Docs., U.S. G.P.O., distributor, bib 2013-04-19 05:25:40 1 2004 dcu eng SE UMN umn umn google google +umn.31951001907305l allow pdus 000056498 fasc.27-28 (1921) UMN 001499770 1755884 0768-5475 Bibliothèque de la Faculté de philosophie et lettres de l'Université de Liège. Les Belles lettres, 1897- bib 2015-01-10 03:25:14 0 1921 be fre SE UMN umn umn google google +mdp.39015079651769 deny ic 003051832 v.33 no.3-4 1997-1998 MIU 003051832 26905188 0970-4728 92650249 ILA bulletin. Indian Library Association, 1975- bib 2009-09-07 22:30:12 0 1998 ii eng SE MIU umich umich google google +mdp.39015074028492 deny und 000500090 v.49 miu 000500090 1770485 0372-9192 46018870 zeitschrift für klinische medizin. springer-verlag. 
bib 2012-11-16 04:30:03 0 1965 gw ger se miu umich umich google google +coo.31924059002018 deny ic 000636079 78th 1991 coo 1649072 1768174 0190-6313 15026170 annual report / board of governors of the federal reserve system board of governors of the federal reserve system, [1967]- bib 2013-02-20 04:31:57 0 1991 dcu eng se coo cornell cornell google google board of governors of the federal reserve system (u.s.) +mdp.39015076621310 allow pd 000637343 v.4,7-8,14 1854,1857-1858,1863 miu 000637343 1757026 0892-8266 07041920 transactions of the medical society of the state of pennsylvania at its ... annual session. the society. bib 2013-08-09 20:26:57 0 1863 pau eng se miu umich umich lit-dlps-dc open medical society of the state of pennsylvania. +chi.72660825 allow pdus 000505286 c.1 no.3-6 chi 290494 1479819 0065-8170 52033049 memoirs of the american entomological society. american entomological society. gfv 2012-04-29 01:02:29 0 9999 pau eng se chi uchicago uchicago google google american entomological society. +mdp.39015024068416 deny ic 000528005 v.76 1991 miu 000528005 1772818 0068-1202 07036968 //r493 proceedings of the british academy. published for the british academy by g. cumberlege, oxford university press, bib 2012-06-21 19:30:18 0 1991 enk eng se miu umich umich google google +uc1.b4106780 deny ic 010010552 1961 uc .b17076882x 4128023 0042-465x sn 86012760 vestnik oftalʹmologii. medit︠s︡ina. bib 2011-07-13 20:30:35 0 1961 ru rus se nrlf universityofcalifornia universityofcalifornia google google +njp.32101077271102 allow pd 009034747 ser.3, v.7 (apr. 1905-jan. 1907) njp 628627 1481386 0003-4827 05032209 //r822 annals of iowa. iowa state historical dept., division of historical museum and archives, bib 2012-10-27 19:30:39 0 1907 iau eng se njp princeton princeton google google +mdp.39015068664518 deny ic 002866340 v.22-26 2002-2006 miu 002866340 12251337 0971-0388 87909513 the aligarh journal of statistics. dept. 
of statistics, aligarh muslim university, bib 2010-04-01 20:31:04 0 2006 ii eng se miu umich umich google google +uc1.$b771057 allow pd 010690843 no.660-666 1950 uc .b157201983 2636094 0041-767x 86655128 united states government publications monthly catalog / issued by the superintendent of documents. u.s. g.p.o. ; for sale by the supt. of docs., 1940-1951. bib 2020-05-29 03:25:03 1 1950 dcu eng se nrlf universityofcalifornia universityofcalifornia google google +uc1.31822009591405 deny und 000521949 v.18 uc .b27054986 3402040 0021-1311 the irish naturalists' journal. i.n.j. committee. bib 2010-08-12 20:31:43 0 9999 nik eng se ucsd universityofcalifornia universityofcalifornia google google +pst.000046424143 deny ic 012255615 1st.ed. 2000 pst a2122089 42836372 1527-4837 sn 99009894 fodor's naples and the amalfi coast. fodor's travel publications, incorporated, c2000- bib 2014-11-05 10:28:06 0 2000 xx eng se pst psu psu google google +uc1.32106007707885 deny und 000067489 v.6:1-5 uc .b15250313 8049660 0730-7004 88654892 american health. american health partners, c1982-. bib 2010-08-21 20:31:35 0 1982 eng se ucsc universityofcalifornia universityofcalifornia google google +uc1.b3540645 deny ic 000640615 v.7 (1944) uc .b126675284 1566509 0012-1223 40012538 //r52 deutsches archiv für erforschung des mittelalters. böhlau. bib 2012-07-19 04:30:58 0 1944 gw ger se nrlf universityofcalifornia universityofcalifornia google google +mdp.39015069542184 deny und 000548966 no.301-319 MIU 000548966 20910946 0369-9870 90656505 Prace Instytutu Badawczego Leśnictwa. Państwowe Wydawn. Rolnicze i Leśne, 1958-2000. bib 2010-05-24 22:30:40 0 2000 pl pol SE MIU umich umich google google +pst.000054405837 allow pd 009445702 1951-1958 PST a108888 1606613 2010229450 Evaporated, condensed, and dry milk report. Dept. of Agriculture. 
bib 2018-12-16 03:25:14 1 1958 dcu eng SE PST psu psu google google +mdp.39015013030138 deny ic 000546404 v.9 1981 46257-67357 MIU 000546404 3837820 0162-704X 78645419 Conference papers index. Data Courier, Inc. bib 2013-04-30 19:25:08 0 1981 kyu eng SE MIU umich umich google google +umn.31951p00389299k deny ic 002543035 no.16-24 1989-93 UMN 001000958 20951089 1050-2351 sn 90014073 Latin American population history bulletin. Dept. of History, University of Minnesota, c1989- bib 2015-02-12 03:26:52 0 1993 mnu eng SE UMN umn umn google google -- cgit v1.2.3