diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-13 22:12:03 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-13 22:12:03 -0700 |
commit | 2dec107e37280b55dddae74cd0328f2d5c7979b6 (patch) | |
tree | 3083bad52ef8814368c5d6cadf2ccdb89266e764 /chocula/directories/vanished_inactive.py | |
parent | 043b35040e4385c674267aa88c4056bdfdd9cb6c (diff) | |
download | chocula-2dec107e37280b55dddae74cd0328f2d5c7979b6.tar.gz chocula-2dec107e37280b55dddae74cd0328f2d5c7979b6.zip |
update vanished journal importer for 2020-09-03 dataset
Diffstat (limited to 'chocula/directories/vanished_inactive.py')
-rw-r--r-- | chocula/directories/vanished_inactive.py | 32 |
1 file changed, 13 insertions, 19 deletions
diff --git a/chocula/directories/vanished_inactive.py b/chocula/directories/vanished_inactive.py
index 253940c..8b23525 100644
--- a/chocula/directories/vanished_inactive.py
+++ b/chocula/directories/vanished_inactive.py
@@ -1,9 +1,9 @@
 import csv
 from typing import Iterable, Optional
 
-from chocula.util import clean_str, clean_issn, parse_lang, parse_country
+from chocula.util import clean_str, clean_issn
 from chocula.common import DirectoryLoader
-from chocula.database import DirectoryInfo
+from chocula.database import DirectoryInfo, HomepageUrl
 
 
 class VanishedInactiveLoader(DirectoryLoader):
@@ -13,38 +13,32 @@ class VanishedInactiveLoader(DirectoryLoader):
 
     CSV headers:
 
-    - Source
     - Title
-    - Identifier
-    - Publisher
-    - Comment
-    - Language
+    - URL
     - ISSN
     - EISSN
-    - Keyword
-    - Start Year
-    - End Year
-    - Added on date
-    - Subjects
-    - Country
-    - Publication fee
-    - Further Information
     """
 
     source_slug = "vanished_inactive"
 
     def open_file(self) -> Iterable:
-        return csv.DictReader(open(self.config.vanished_inactive.filepath))
+        return csv.DictReader(open(self.config.vanished_inactive.filepath), delimiter=";")
 
     def parse_record(self, record) -> Optional[DirectoryInfo]:
+        # HACK
+        record["Title"] = record["\ufeffTitle"]
+        if not record["Title"]:
+            return None
+
         info = DirectoryInfo(
             directory_slug=self.source_slug,
             raw_issn=clean_issn(record["ISSN"]),
             issne=clean_issn(record["EISSN"]),
             name=clean_str(record["Title"]),
-            publisher=clean_str(record["Publisher"]),
-            langs=[lang for lang in [parse_lang(record["Language"])] if lang],
-            country=parse_country(record["Country"]),
         )
+
+        homepage = HomepageUrl.from_url(record["URL"])
+        if homepage:
+            info.homepage_urls.append(homepage)
         return info