# Source: commit 571b1f77e9375c7bab5ccbe8ae41c60dd2c64779
# Author: Bryan Newbold
# Date: Tue, 23 Jun 2020 17:23:13 -0700
# Subject: ISSN portal metadata directory importer
# File: chocula/directories/issn_meta.py

from typing import Iterable, Optional
import json

from chocula.util import clean_str, clean_issn, parse_country
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl


class IssnMetaLoader(DirectoryLoader):
    """
    This is JSON-LD (-ish) scraped from portal.issn.org, filtered down to only
    journals already in the corpus, or matching a couple other criteria.

    Metadata we expect to get:

    - high quality English title
    - URLs
    - country

    TODO: non-english alternative titles
    """

    source_slug = "issn_meta"

    def open_file(self) -> Iterable:
        """Open the configured ISSN metadata dump for line-by-line reading.

        The path is read from ``self.config.issn_meta.filepath``. The file is
        opened explicitly as UTF-8 so decoding does not depend on the locale
        of the machine running the import.
        """
        return open(self.config.issn_meta.filepath, "r", encoding="utf-8")

    def parse_record(self, row) -> Optional[DirectoryInfo]:
        """Convert one JSON-encoded record into a ``DirectoryInfo``.

        ``row`` is a JSON document that decodes to an array of metadata
        elements (dicts). Each element may contribute a country, the ISSN-L,
        a title plus print/electronic ISSN, and homepage URLs.

        Raises whatever ``json.loads`` raises on malformed input, and
        ``KeyError`` if an element that is recognized by type/format lacks the
        ``value`` / ``issn`` key it is expected to carry.
        """
        elements = json.loads(row)

        info = DirectoryInfo(directory_slug=self.source_slug)
        # format is an array of metadata elements
        for el in elements:
            # Elements are not guaranteed to carry "@id"; look it up
            # defensively instead of indexing, so an id-less element with a
            # "value" does not abort the whole record.
            el_id = el.get("@id") or ""
            if (
                "value" in el
                and el_id.startswith("http://id.loc.gov/vocabulary/countries")
                and len(el_id.split("/")[-1]) == 2
            ):
                info.country = parse_country(el["value"])

            if "@type" not in el:
                continue

            if el["@type"] == "http://id.loc.gov/ontologies/bibframe/IssnL":
                info.issnl = clean_issn(el["value"])

            if "mainTitle" in el:
                title = el["mainTitle"]
                # Same single-value-or-list normalization that is applied to
                # "url" below; presumably clean_str expects a single string
                # (TODO confirm), so take the first title when several exist.
                if isinstance(title, list):
                    title = title[0] if title else None
                if title is not None:
                    info.name = clean_str(title)
                if el.get("format") == "vocabularies/medium#Print":
                    info.issnp = clean_issn(el["issn"])
                elif el.get("format") == "vocabularies/medium#Electronic":
                    info.issne = clean_issn(el["issn"])

            # "url" may be a single string or a list of strings.
            urls = el.get("url", [])
            if isinstance(urls, str):
                urls = [urls]
            for url in urls:
                homepage = HomepageUrl.from_url(url)
                # from_url may yield a falsy result; only keep usable URLs
                if homepage:
                    info.homepage_urls.append(homepage)

        return info