ISSN portal metadata directory importer

author: Bryan Newbold <bnewbold@archive.org> 2020-06-23 17:23:13 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-23 17:23:13 -0700
commit: 571b1f77e9375c7bab5ccbe8ae41c60dd2c64779 (patch)
tree: 3f94882b2bbbd3b5122603fc38b7b3fbf718ddc3 /chocula
parent: a4f9030d1aa49e18e699fcf37d336fa2f03f804c (diff)
download: chocula-571b1f77e9375c7bab5ccbe8ae41c60dd2c64779.tar.gz
chocula-571b1f77e9375c7bab5ccbe8ae41c60dd2c64779.zip
1 file changed, 61 insertions, 0 deletions
diff --git a/chocula/directories/issn_meta.py b/chocula/directories/issn_meta.py
new file mode 100644
index 0000000..e6abb3c
--- /dev/null
+++ b/chocula/directories/issn_meta.py
@@ -0,0 +1,61 @@
+from typing import Iterable, Optional
+import json
+
+from chocula.util import clean_str, clean_issn, parse_country
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class IssnMetaLoader(DirectoryLoader):
+    """
+    This is JSON-LD (-ish) scraped from portal.issn.org, filtered down to only
+    journals already in the corpus, or matching a couple other criteria.
+
+    Metadata we expect to get:
+
+    - high quality English title
+    - URLs
+    - country
+
+    TODO: non-English alternative titles
+    """
+
+    source_slug = "issn_meta"
+
+    def open_file(self) -> Iterable:
+        """Open the file at ``config.issn_meta.filepath`` for reading as text."""
+        return open(self.config.issn_meta.filepath, "r")
+
+    def parse_record(self, row) -> Optional[DirectoryInfo]:
+        """Parse one JSON-encoded array of metadata elements into a DirectoryInfo."""
+        info = DirectoryInfo(directory_slug=self.source_slug,)
+        for el in json.loads(row):
+            # country element: has a "value"; a missing "@id" no longer raises KeyError
+            el_id = el.get("@id", "") if "value" in el else ""
+            if (
+                el_id.startswith("http://id.loc.gov/vocabulary/countries")
+                and len(el_id.split("/")[-1]) == 2
+            ):
+                info.country = parse_country(el["value"])
+            # the remaining fields are only read from elements that carry "@type"
+            if "@type" not in el:
+                continue
+            if el["@type"] == "http://id.loc.gov/ontologies/bibframe/IssnL":
+                info.issnl = clean_issn(el["value"])
+            if "mainTitle" in el:
+                info.name = clean_str(el["mainTitle"])
+                if el.get("format") == "vocabularies/medium#Print":
+                    info.issnp = clean_issn(el["issn"])
+                elif el.get("format") == "vocabularies/medium#Electronic":
+                    info.issne = clean_issn(el["issn"])
+            urls = el.get("url", [])
+            if isinstance(urls, str):
+                # "url" may be a bare string; wrap it so the loop sees a list
+                urls = [urls]
+            for url in urls:
+                homepage = HomepageUrl.from_url(url)
+                # skip URLs for which from_url yields nothing truthy
+                if homepage:
+                    info.homepage_urls.append(homepage)
+
+        return info
author	Bryan Newbold <bnewbold@archive.org>	2020-06-23 17:23:13 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-06-23 17:23:13 -0700
commit	571b1f77e9375c7bab5ccbe8ae41c60dd2c64779 (patch)
tree	3f94882b2bbbd3b5122603fc38b7b3fbf718ddc3 /chocula
parent	a4f9030d1aa49e18e699fcf37d336fa2f03f804c (diff)
download	chocula-571b1f77e9375c7bab5ccbe8ae41c60dd2c64779.tar.gz chocula-571b1f77e9375c7bab5ccbe8ae41c60dd2c64779.zip