aboutsummaryrefslogtreecommitdiffstats
path: root/chocula/directories/manual_homepages.py
blob: 3f84794b94b01527e4ecd36b45451449d5e86e2d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import csv
from typing import Iterable, Optional

from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl


class ManualHomepageLoader(DirectoryLoader):
    """
    Directory loader for a tab-separated file of manually curated homepage
    URLs.

    Each usable row yields a DirectoryInfo that carries the row's ISSN-L and
    a single homepage URL; every other column in the file is ignored.
    """

    source_slug = "manual_homepages"

    def open_file(self) -> Iterable:
        """
        Yield one dict per data row of the file at
        `self.config.manual_homepages.filepath`.

        This is a generator: the file is opened when iteration starts and is
        closed as soon as the rows are exhausted or the generator is
        discarded, instead of leaving the handle open.
        """
        # newline="" lets the csv module do its own line-ending handling, as
        # recommended by the csv documentation.
        with open(self.config.manual_homepages.filepath, newline="") as tsv_file:
            yield from csv.DictReader(tsv_file, delimiter="\t")

    def parse_record(self, record) -> Optional[DirectoryInfo]:
        """
        Most of this metadata comes from chocula/fatcat; we are only interested
        in the homepage URLs.

        The "corrected titles" have been manually entered into fatcat directly.

        CSV columns:
        - issnl
        - issnp
        - issne
        - name
        - corrected title
        - publisher
        - country
        - lang
        - release_count
        - Homepage URL
        - Inactive

        Returns a DirectoryInfo holding the row's ISSN-L and homepage URL, or
        None when the row has no usable URL (missing cell, the placeholder
        "unknown", fewer than 4 characters, or rejected by
        HomepageUrl.from_url).
        """

        raw_url = record["Homepage URL"]
        # csv.DictReader fills cells missing from a short row with None.
        if raw_url is None:
            return None

        # The cells are hand-entered, so trim stray whitespace before applying
        # the placeholder and minimum-length checks.
        url = raw_url.strip()
        if url.lower() == "unknown" or len(url) < 4:
            return None

        homepage = HomepageUrl.from_url(url)
        if homepage is None:
            return None

        info = DirectoryInfo(directory_slug=self.source_slug, issnl=record["issnl"])
        info.homepage_urls.append(homepage)
        return info