chocula/directories/sim.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71


from typing import Iterable, Optional, Dict, Any
import csv

from chocula.util import clean_str, parse_mimetypes, parse_country, parse_lang, PLATFORM_MAP, gaps_to_spans
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl


class SimLoader(DirectoryLoader):
    """Directory loader for the "sim" source, which is read from a CSV file.

    Each CSV row is converted into a ``DirectoryInfo`` record tagged with
    ``source_slug``.
    """

    source_slug = "sim"

    def open_file(self) -> Iterable:
        """Yield one dict per data row of the CSV at ``self.config.sim.filepath``.

        This is a generator, so the file is opened on first iteration and is
        closed as soon as the rows are exhausted or the generator is
        discarded, instead of leaking the handle until garbage collection.
        """
        # NOTE(review): open() keeps its default newline handling on purpose.
        # parse_record() looks up headers that contain a bare "\n"; switching
        # to newline="" could change those keys if the file uses CRLF line
        # endings -- confirm against the real data before changing this.
        with open(self.config.sim.filepath) as csv_file:
            yield from csv.DictReader(csv_file)

    @staticmethod
    def _parse_year(value: Optional[str]) -> Optional[int]:
        """Return *value* as an int, or None when the cell is missing or blank."""
        text = (value or "").strip()
        return int(text) if text else None

    def parse_record(self, row) -> Optional[DirectoryInfo]:
        """Convert one CSV row into a ``DirectoryInfo``.

        Columns listed for the input file (the two peer-review headers
        contain a line break inside the header text):

            NA Pub Cat ID
            Title
            Publisher
            ISSN
            Impact Rank
            Total Cities
            Journal Impact Factor
            Eigenfact or Score
            First Volume
            Last Volume
            NA Gaps
            Scholarly / Peer-<newline>Reviewed
            Peer-<newline>Reviewed
            Pub Type
            Pub Language
            Subjects

        Args:
            row: mapping of column header to cell text, as produced by
                ``csv.DictReader``.

        Returns:
            The populated ``DirectoryInfo``; this implementation never
            returns None.

        Raises:
            KeyError: if a column other than 'NA Pub Cat ID' is absent.
            ValueError: if a year or gap cell holds non-numeric text.
        """
        # TODO: 'Pub Type' (for now only copied, cleaned, into extra['pub_type'])

        extra: Dict[str, Any] = {}

        first_year = self._parse_year(row['First Volume'])
        if first_year is not None:
            extra['first_year'] = first_year
        last_year = self._parse_year(row['Last Volume'])
        if last_year is not None:
            extra['last_year'] = last_year

        # 'NA Gaps' is a ';'-separated list of integers; blank entries (and a
        # missing cell) are skipped.
        gaps = [int(g) for g in (row['NA Gaps'] or '').split(';') if g.strip()]
        if gaps:
            extra['gaps'] = gaps

        # Spans are only computed when both endpoints are known (and non-zero).
        if first_year and last_year:
            extra['year_spans'] = gaps_to_spans(first_year, last_year, gaps)

        # These two header names really do contain an embedded newline.
        extra['scholarly_peer_reviewed'] = row["Scholarly / Peer-\nReviewed"]
        extra['peer_reviewed'] = row["Peer-\nReviewed"]
        extra['pub_type'] = clean_str(row["Pub Type"])

        # .get() yields None when the column is absent; coalesce before
        # .strip() so a missing id becomes None instead of an AttributeError.
        custom_id = (row.get('NA Pub Cat ID') or '').strip() or None

        return DirectoryInfo(
            directory_slug=self.source_slug,
            name=clean_str(row['Title']),
            publisher=clean_str(row['Publisher']),
            # Only the first 9 characters of the ISSN cell are kept.
            raw_issn=row['ISSN'][:9],
            custom_id=custom_id,
            langs=[parse_lang(row['Pub Language'])],
            extra=extra,
        )