# Reconstructed from commit 427f25fb9a362348df644afae2f56124634ca67d
# ("vanished journal metadata importer"), which adds two new modules:
#   - chocula/directories/vanished_disapeared.py  (VanishedDisapearedLoader)
#   - chocula/directories/vanished_inactive.py    (VanishedInactiveLoader)
# Both loaders are shown together here; the imports below are the union of
# what the two modules need.

import csv
from typing import Iterable, Optional

from chocula.util import clean_str, clean_issn, parse_lang, parse_country
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl


class VanishedDisapearedLoader(DirectoryLoader):
    """
    Journal-level metadata from the "Vanished Journals" project. This is the
    "disapeared" dataset (spelling matches the source slug), with many
    homepage URLs in wayback (web.archive.org).

    CSV headers:

    - Source
    - If Identified by second source
    - Journal Name
    - ISSN
    - E-ISSN
    - URL
    - Publisher
    - blank
    - Language(s)
    - Country
    - society_affiliation
    - other_sci_affiliation
    - Discipline
    - Discipline Group
    - Start Year
    - End Year
    - Last Year Online
    - Actively Publishing
    - Internet Archive Link
    - Verified
    - Comments
    - The Keepers (archived)
    - Archive Link
    - Mikael (1 = agree with Lisa)
    """

    source_slug = "vanished_disapeared"

    def open_file(self) -> Iterable:
        """
        Iterate over the rows of the configured CSV file.

        Each row is a dict keyed by the CSV header names. The file is read
        lazily: it is opened when iteration starts and closed when the
        iterator is exhausted or discarded, so no file handle is leaked.
        """
        # newline="" lets the csv module do its own line-ending handling,
        # as the csv documentation recommends for quoted multi-line fields.
        with open(self.config.vanished_disapeared.filepath, newline="") as csv_file:
            yield from csv.DictReader(csv_file)

    def parse_record(self, record) -> Optional[DirectoryInfo]:
        """
        Convert one CSV row (a dict keyed by header name) to a DirectoryInfo.

        Homepage URLs are taken from the "Internet Archive Link" and
        "Archive Link" columns, in that order; values that
        HomepageUrl.from_url() rejects (returns a falsy result for) are
        skipped. This implementation always returns a DirectoryInfo.
        """
        # NOTE(review): assumes parse_lang() returns a falsy value when it
        # cannot identify a language -- confirm against chocula.util. Such
        # values are dropped so that `langs` never ends up as [None].
        lang = parse_lang(record["Language(s)"])

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(record["ISSN"]),
            issne=clean_issn(record["E-ISSN"]),
            name=clean_str(record["Journal Name"]),
            publisher=clean_str(record["Publisher"]),
            langs=[lang] if lang else [],
            country=parse_country(record["Country"]),
        )

        for column in ("Internet Archive Link", "Archive Link"):
            homepage = HomepageUrl.from_url(record[column])
            if homepage:
                info.homepage_urls.append(homepage)
        return info


class VanishedInactiveLoader(DirectoryLoader):
    """
    Journal-level metadata from the "Vanished Journals" project. This is the
    "inactive" dataset.

    CSV headers:

    - Source
    - Title
    - Identifier
    - Publisher
    - Comment
    - Language
    - ISSN
    - EISSN
    - Keyword
    - Start Year
    - End Year
    - Added on date
    - Subjects
    - Country
    - Publication fee
    - Further Information
    """

    source_slug = "vanished_inactive"

    def open_file(self) -> Iterable:
        """
        Iterate over the rows of the configured CSV file.

        Each row is a dict keyed by the CSV header names. The file is read
        lazily: it is opened when iteration starts and closed when the
        iterator is exhausted or discarded, so no file handle is leaked.
        """
        # newline="" lets the csv module do its own line-ending handling,
        # as the csv documentation recommends for quoted multi-line fields.
        with open(self.config.vanished_inactive.filepath, newline="") as csv_file:
            yield from csv.DictReader(csv_file)

    def parse_record(self, record) -> Optional[DirectoryInfo]:
        """
        Convert one CSV row (a dict keyed by header name) to a DirectoryInfo.

        No homepage URLs are extracted for this dataset. This implementation
        always returns a DirectoryInfo.
        """
        # NOTE(review): assumes parse_lang() returns a falsy value when it
        # cannot identify a language -- confirm against chocula.util. Such
        # values are dropped so that `langs` never ends up as [None].
        lang = parse_lang(record["Language"])

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=clean_issn(record["ISSN"]),
            issne=clean_issn(record["EISSN"]),
            name=clean_str(record["Title"]),
            publisher=clean_str(record["Publisher"]),
            langs=[lang] if lang else [],
            country=parse_country(record["Country"]),
        )
        return info