aboutsummaryrefslogtreecommitdiffstats
path: root/chocula/directories/doaj.py
blob: 795ce684aa7163a20de69285869ae494ff71d23e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from typing import Iterable, Optional, Dict, Any
import csv

from chocula.util import (
    clean_str,
    parse_mimetypes,
    parse_country,
    parse_lang,
    PLATFORM_MAP,
)
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl


class DoajLoader(DirectoryLoader):
    """
    Loader for the DOAJ (Directory of Open Access Journals) journal-level
    CSV export. Each row describes one journal; rows are parsed into
    DirectoryInfo records.

    CSV Columns:

    - Journal title
    - Journal URL
    - Alternative title
    - Journal ISSN (print version)
    - Journal EISSN (online version)
    - Publisher
    - Society or institution
    - "Platform
    - host or aggregator"
    - Country of publisher
    - Journal article processing charges (APCs)
    - APC information URL
    - APC amount
    - Currency
    - Journal article submission fee
    - Submission fee URL
    - Submission fee amount
    - Submission fee currency
    - Number of articles publish in the last calendar year
    - Number of articles information URL
    - Journal waiver policy (for developing country authors etc)
    - Waiver policy information URL
    - Digital archiving policy or program(s)
    - Archiving: national library
    - Archiving: other
    - Archiving infomation URL
    - Journal full-text crawl permission
    - Permanent article identifiers
    - Journal provides download statistics
    - Download statistics information URL
    - First calendar year journal provided online Open Access content
    - Full text formats
    - Keywords
    - Full text language
    - URL for the Editorial Board page
    - Review process
    - Review process information URL
    - URL for journal's aims & scope
    - URL for journal's instructions for authors
    - Journal plagiarism screening policy
    - Plagiarism information URL
    - Average number of weeks between submission and publication
    - URL for journal's Open Access statement
    - Machine-readable CC licensing information embedded or displayed in articles
    - URL to an example page with embedded licensing information
    - Journal license
    - License attributes
    - URL for license terms
    - Does this journal allow unrestricted reuse in compliance with BOAI?
    - Deposit policy directory
    - Author holds copyright without restrictions
    - Copyright information URL
    - Author holds publishing rights without restrictions
    - Publishing rights information URL
    - DOAJ Seal
    - Tick: Accepted after March 2014
    - Added on Date
    - Subjects
    """

    source_slug = "doaj"

    def open_file(self) -> Iterable:
        """
        Open the configured DOAJ CSV snapshot and return a row iterator.

        NOTE(review): the file handle is intentionally left open so the
        DictReader can stream rows; it is closed when the process exits.
        """
        return csv.DictReader(open(self.config.doaj.filepath))

    def parse_record(self, row) -> Optional[DirectoryInfo]:
        """
        Convert a single DOAJ CSV row (a dict keyed by column header) into
        a DirectoryInfo record.

        Returns a DirectoryInfo; never returns None in the current
        implementation, but the Optional return type matches the loader
        interface.
        """
        # TODO: Subjects, Permanent article identifiers, work_level stuff

        info = DirectoryInfo(
            directory_slug=self.source_slug,
            # NOTE(review): ISSNs are passed through raw (not clean_str'd);
            # presumably normalized downstream — confirm against DirectoryInfo.
            issnp=row["Journal ISSN (print version)"],
            issne=row["Journal EISSN (online version)"],
            name=clean_str(row["Journal title"]),
            publisher=clean_str(row["Publisher"]),
            platform=PLATFORM_MAP.get(row["Platform, host or aggregator"]),
            country=parse_country(row["Country of publisher"]),
        )

        lang = parse_lang(row["Full text language"])
        if lang:
            info.langs.append(lang)

        extra: Dict[str, Any] = dict(doaj=dict())
        extra["mimetypes"] = parse_mimetypes(row["Full text formats"])
        extra["doaj"]["as_of"] = self.config.snapshot.date
        if row["DOAJ Seal"]:
            # raises KeyError on any value other than yes/no (case-insensitive),
            # which surfaces unexpected upstream data loudly
            extra["doaj"]["seal"] = {"no": False, "yes": True}[row["DOAJ Seal"].lower()]

        if row["Digital archiving policy or program(s)"]:
            # comma-separated list of archiving programs (e.g. LOCKSS, CLOCKSS)
            extra["archive"] = [
                a.strip()
                for a in row["Digital archiving policy or program(s)"].split(",")
                if a.strip()
            ]
        elif row["Archiving: national library"]:
            extra["archive"] = ["national-library"]

        crawl_permission = row["Journal full-text crawl permission"]
        if crawl_permission:
            # exact-case "Yes"/"No" expected; anything else raises KeyError
            extra["crawl-permission"] = dict(Yes=True, No=False)[crawl_permission]
        default_license = row["Journal license"]
        if default_license and default_license.startswith("CC"):
            # normalize "CC BY" style to "CC-BY" style
            extra["default_license"] = default_license.replace("CC ", "CC-").strip()

        # BUGFIX: previously the `extra` dict was built but never attached to
        # the returned record, silently discarding mimetypes, seal, archiving,
        # crawl-permission, and license metadata.
        info.extra = extra

        url = row["Journal URL"]
        if url:
            homepage = HomepageUrl.from_url(row["Journal URL"])
            if homepage:
                info.homepage_urls.append(homepage)
        return info