diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-23 11:39:10 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-23 11:41:49 -0700 |
commit | 9f53eb4c4fd4030965fe004184c803b41fa49b04 (patch) | |
tree | 70532f2f32efdcb5326f7844560e6b3e8a5441f1 /chocula | |
parent | cd7fce808e60c09d184b7ec7e72570494a87d399 (diff) | |
download | chocula-9f53eb4c4fd4030965fe004184c803b41fa49b04.tar.gz chocula-9f53eb4c4fd4030965fe004184c803b41fa49b04.zip |
new manual homepage source
Diffstat (limited to 'chocula')
-rw-r--r-- | chocula/directories/__init__.py | 2 | ||||
-rw-r--r-- | chocula/directories/manual_homepages.py | 47 |
2 files changed, 49 insertions, 0 deletions
"""
Directory loader for a manually curated table of journal homepage URLs.

Module path: chocula/directories/manual_homepages.py

The loader is registered in chocula/directories/__init__.py, which imports
ManualHomepageLoader and appends it to ALL_CHOCULA_DIR_CLASSES.
"""

import csv
from typing import Iterable, Optional

from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl


class ManualHomepageLoader(DirectoryLoader):
    """
    Loads homepage URLs from the tab-separated "manual homepages" file.

    The file location is read from `self.config.manual_homepages.filepath`.
    Only the `issnl` and `Homepage URL` columns are consumed.
    """

    source_slug = "manual_homepages"

    def open_file(self) -> Iterable:
        """
        Yield one dict per data row of the tab-separated input file.

        This is a generator: the file is opened when iteration starts and is
        closed by the `with` block as soon as the rows are exhausted (or the
        generator is closed), instead of leaking the handle.
        """
        # newline="" lets the csv module do its own newline handling, as the
        # csv documentation recommends for file objects passed to readers.
        with open(self.config.manual_homepages.filepath, newline="") as tsv_file:
            yield from csv.DictReader(tsv_file, delimiter="\t")

    def parse_record(self, record) -> Optional[DirectoryInfo]:
        """
        Convert one row into a DirectoryInfo carrying a single homepage URL.

        Most of this metadata comes from chocula/fatcat; we are only interested
        in the homepage URLs.

        The "corrected titles" have been manually entered into fatcat directly.

        CSV columns:
        - issnl
        - issnp
        - issne
        - name
        - corrected title
        - publisher
        - country
        - lang
        - release_count
        - Homepage URL
        - Inactive

        Returns None when the row has no usable homepage URL: the cell is
        missing, is the placeholder "unknown" (any case), is shorter than four
        characters, or is rejected by HomepageUrl.from_url().
        """

        info = DirectoryInfo(directory_slug=self.source_slug, issnl=record["issnl"])

        url = record["Homepage URL"]
        # csv.DictReader fills cells missing from a short row with None.
        if url is None:
            return None

        # Cells are typed in by hand, so ignore surrounding whitespace before
        # applying the placeholder and minimum-length checks.
        url = url.strip()
        if url.lower() == "unknown" or len(url) < 4:
            return None

        homepage = HomepageUrl.from_url(url)
        # A row without a parsed homepage carries nothing of interest, so never
        # hand back a DirectoryInfo with an empty homepage list.
        if not homepage:
            return None

        info.homepage_urls.append(homepage)
        return info