blob: 4416125404178241e4eb0e8af09b295502870086 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
from typing import Iterable, Optional
import json
from chocula.util import clean_str, clean_issn, parse_country
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl
class IssnMetaLoader(DirectoryLoader):
    """
    This is JSON-LD (-ish) scraped from portal.issn.org, filtered down to only
    journals already in the corpus, or matching a couple other criteria.

    Metadata we expect to get:

    - high quality English title
    - URLs
    - country

    TODO: non-english alternative titles
    """

    source_slug = "issn_meta"

    def open_file(self) -> Iterable:
        # One JSON record per line (consumed line-by-line by parse_record).
        return open(self.config.issn_meta.filepath, "r")

    def parse_record(self, row) -> Optional[DirectoryInfo]:
        """
        Parse a single JSON-LD line into a DirectoryInfo.

        The record is an array of metadata elements; each element may
        contribute a country, ISSN-L, title, print/electronic ISSN, or
        homepage URLs to the accumulated DirectoryInfo.
        """
        row = json.loads(row)
        info = DirectoryInfo(directory_slug=self.source_slug)
        # format is an array of metadata elements
        for el in row:
            # country: elements pointing at the LoC countries vocabulary.
            # use .get() because some elements carry "label" without "@id".
            if "label" in el and el.get("@id", "").startswith(
                "http://id.loc.gov/vocabulary/countries"
            ):
                value = el["label"]
                # LoC vocabulary includes sub-national entries like
                # "Washington (State)"; blank those out rather than
                # mis-parsing them as a country.
                if "(State)" in value:
                    value = ""
                if value == "Russia (Federation)":
                    value = "Russia"
                # BUGFIX: previously passed the raw el["label"] here, which
                # silently discarded the normalizations above.
                info.country = parse_country(value)
            if "@type" not in el:
                continue
            if el["@type"] == "http://id.loc.gov/ontologies/bibframe/IssnL":
                info.issnl = clean_issn(el["value"])
            if "mainTitle" in el:
                # mainTitle may be a bare string or a list; take the first.
                if isinstance(el["mainTitle"], list):
                    info.name = clean_str(el["mainTitle"][0])
                else:
                    info.name = clean_str(el["mainTitle"])
            if el.get("format") == "vocabularies/medium#Print":
                info.issnp = clean_issn(el["issn"])
            elif el.get("format") == "vocabularies/medium#Electronic":
                info.issne = clean_issn(el["issn"])
            # url may likewise be a bare string or a list of strings.
            urls = el.get("url", [])
            if isinstance(urls, str):
                urls = [urls]
            for url in urls:
                homepage = HomepageUrl.from_url(url)
                if homepage:
                    info.homepage_urls.append(homepage)
        return info
|