import json
from typing import Iterable, Optional

from chocula.util import clean_str, clean_issn, parse_country
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl


class IssnMetaLoader(DirectoryLoader):
    """
    This is JSON-LD (-ish) scraped from portal.issn.org, filtered down to
    only journals already in the corpus or matching a couple of other
    criteria.

    Metadata we expect to get:

    - high-quality English title
    - URLs
    - country

    TODO: non-English alternative titles
    """

    source_slug = "issn_meta"

    def open_file(self) -> Iterable:
        return open(self.config.issn_meta.filepath, "r")

    def parse_record(self, row) -> Optional[DirectoryInfo]:
        # each line of the input file is a standalone JSON document
        row = json.loads(row)
        info = DirectoryInfo(directory_slug=self.source_slug)

        # the record is an array of metadata elements
        for el in row:
            # country is indicated by a two-letter id.loc.gov country code
            if (
                "value" in el
                and el.get("@id", "").startswith(
                    "http://id.loc.gov/vocabulary/countries"
                )
                and len(el["@id"].split("/")[-1]) == 2
            ):
                info.country = parse_country(el["value"])
            if "@type" not in el:
                continue
            if el["@type"] == "http://id.loc.gov/ontologies/bibframe/IssnL":
                info.issnl = clean_issn(el["value"])
            if "mainTitle" in el:
                info.name = clean_str(el["mainTitle"])
            # print vs. electronic ISSN is distinguished by the medium format
            if el.get("format") == "vocabularies/medium#Print":
                info.issnp = clean_issn(el["issn"])
            elif el.get("format") == "vocabularies/medium#Electronic":
                info.issne = clean_issn(el["issn"])
            # "url" may be either a single string or a list of strings
            urls = el.get("url", [])
            if isinstance(urls, str):
                urls = [urls]
            for url in urls:
                homepage = HomepageUrl.from_url(url)
                if homepage:
                    info.homepage_urls.append(homepage)

        return info
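

# Illustrative sketch only: roughly what a single parsed row might look like,
# inferred from the field names handled in parse_record() above. The element
# shapes, identifiers, and values below are hypothetical examples, not real
# portal.issn.org records.
#
# example_row = [
#     {"@id": "http://id.loc.gov/vocabulary/countries/gb", "value": "United Kingdom"},
#     {"@type": "http://id.loc.gov/ontologies/bibframe/IssnL", "value": "1234-5678"},
#     {"@type": "...", "mainTitle": "Example Journal", "url": "https://example.com/"},
#     {"@type": "...", "format": "vocabularies/medium#Electronic", "issn": "2345-6789"},
# ]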