diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-23 17:23:13 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-23 17:23:13 -0700 |
commit | 571b1f77e9375c7bab5ccbe8ae41c60dd2c64779 (patch) | |
tree | 3f94882b2bbbd3b5122603fc38b7b3fbf718ddc3 /chocula | |
parent | a4f9030d1aa49e18e699fcf37d336fa2f03f804c (diff) | |
download | chocula-571b1f77e9375c7bab5ccbe8ae41c60dd2c64779.tar.gz chocula-571b1f77e9375c7bab5ccbe8ae41c60dd2c64779.zip |
ISSN portal metadata directory importer
Diffstat (limited to 'chocula')
-rw-r--r-- | chocula/directories/issn_meta.py | 61 |
1 files changed, 61 insertions, 0 deletions
# Reconstructed from the flattened diff of chocula/directories/issn_meta.py
# (new file, mode 100644, blob e6abb3c).
from typing import Iterable, Optional
import json

from chocula.util import clean_str, clean_issn, parse_country
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl


class IssnMetaLoader(DirectoryLoader):
    """
    This is JSON-LD (-ish) scraped from portal.issn.org, filtered down to only
    journals already in the corpus, or matching a couple other criteria.

    Metadata we expect to get:

    - high quality English title
    - URLs
    - country

    Each input record is a JSON array of metadata elements; fields that are
    absent from an element are skipped rather than treated as errors.

    TODO: non-English alternative titles
    """

    source_slug = "issn_meta"

    # Identifier prefix and type URI used to recognize relevant elements.
    _COUNTRY_ID_PREFIX = "http://id.loc.gov/vocabulary/countries"
    _ISSNL_TYPE = "http://id.loc.gov/ontologies/bibframe/IssnL"

    def open_file(self) -> Iterable:
        """
        Open the file at ``self.config.issn_meta.filepath`` for reading.

        Returns the open text-mode file handle; the handle is not closed
        here, so closing it is left to the caller.
        """
        # Explicit encoding so decoding does not depend on the host locale.
        return open(self.config.issn_meta.filepath, "r", encoding="utf-8")

    def parse_record(self, row) -> Optional[DirectoryInfo]:
        """
        Build a DirectoryInfo from one JSON-encoded record.

        ``row`` is JSON text that decodes to an iterable of metadata elements
        (accessed like dicts). Elements are scanned in order, so when several
        elements supply the same field the last one wins.

        Returns the populated DirectoryInfo; this implementation always
        returns an object, never None.

        Raises whatever ``json.loads`` raises when ``row`` is not valid JSON.
        """
        elements = json.loads(row)

        info = DirectoryInfo(directory_slug=self.source_slug)
        # format is an array of metadata elements
        for el in elements:
            if "value" in el:
                # "@id" may be absent even when "value" is present; treat a
                # missing identifier as a non-match instead of raising.
                node_id = el.get("@id") or ""
                if (
                    node_id.startswith(self._COUNTRY_ID_PREFIX)
                    and len(node_id.split("/")[-1]) == 2
                ):
                    info.country = parse_country(el["value"])

            # Everything below only applies to typed elements.
            if "@type" not in el:
                continue

            if el["@type"] == self._ISSNL_TYPE and "value" in el:
                info.issnl = clean_issn(el["value"])

            if "mainTitle" in el:
                info.name = clean_str(el["mainTitle"])
                # NOTE(review): the source diff was flattened onto one line,
                # so whether the format/issn checks were nested under the
                # mainTitle check could not be recovered -- confirm against
                # the repository.
                issn = el.get("issn")
                if issn is not None:
                    medium = el.get("format")
                    if medium == "vocabularies/medium#Print":
                        info.issnp = clean_issn(issn)
                    elif medium == "vocabularies/medium#Electronic":
                        info.issne = clean_issn(issn)

            # "url" may hold a single string or a list of strings.
            urls = el.get("url", [])
            if isinstance(urls, str):
                urls = [urls]
            for url in urls:
                homepage = HomepageUrl.from_url(url)
                if homepage:
                    info.homepage_urls.append(homepage)

        return info