# python/fatcat_tools/importers/journal_metadata.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183


import sys
import json
import itertools
import fatcat_client
from .common import EntityImporter, clean


def or_none(s):
    """
    Normalize "empty" values to None.

    Returns None when `s` is None or has zero length; any other value is
    returned unchanged.
    """
    is_blank = s is None or len(s) == 0
    return None if is_blank else s

def truthy(s):
    """
    Interpret a textual flag as a tri-state boolean.

    Matching is case-insensitive. Returns True for 'true', 't', 'yes', 'y'
    or '1'; False for 'false', 'f', 'no', 'n' or '0'; and None when `s` is
    None or is not one of the recognized spellings.
    """
    if s is None:
        return None

    flag_values = {
        'true': True, 't': True, 'yes': True, 'y': True, '1': True,
        'false': False, 'f': False, 'no': False, 'n': False, '0': False,
    }
    # unrecognized spellings fall through to None
    return flag_values.get(s.lower())

class JournalMetadataImporter(EntityImporter):
    """
    Imports journal metadata ("containers") by ISSN, currently from a custom
    (data munged) .csv file format

    CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):

        ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count


    'extra' fields:

        doaj
            as_of: datetime of most recent check; if not set, not actually in DOAJ
            seal: bool
            work_level: bool (are work-level publications deposited with DOAJ?)
            archiving: array, can include 'library' or 'other'
        road
            as_of: datetime of most recent check; if not set, not actually in ROAD
        pubmed (TODO: delete?)
            as_of: datetime of most recent check; if not set, not actually indexed in pubmed
        norwegian (TODO: drop this?)
            as_of: datetime of most recent check; if not set, not actually in the Norwegian register
            id (integer)
            level (integer; 0-2)
        kbart
            lockss
                year_rle
                volume_rle
            portico
                ...
            clockss
                ...
        sherpa_romeo
            color
        jstor
            year_rle
            volume_rle
        scopus
            id
            TODO: print/electronic distinction?
        wos
            id
        doi
            crossref_doi: DOI of the title in crossref (if exists)
            prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref)
        ia
            sim
                nap_id
                year_rle
                volume_rle
            longtail: boolean
            homepage
                as_of: datetime of last attempt
                url
                status: HTTP/heritrix status of homepage crawl

        issnp: string
        issne: string
        coden: string
        abbrev: string
        oclc_id: string (TODO: lookup?)
        lccn_id: string (TODO: lookup?)
        dblb_id: string
        default_license: slug
        original_name: native name (if name is translated)
        platform: hosting platform: OJS, wordpress, scielo, etc
        mimetypes: array of strings (eg, 'application/pdf', 'text/html')
        first_year: year (integer)
        last_year: if publishing has stopped
        primary_language: single ISO code, or 'mixed'
        languages: array of ISO codes
        region: TODO: continent/world-region
        nation: shortcode of nation
        discipline: TODO: highest-level subject; "life science", "humanities", etc
        field: TODO: narrower description of field
        subjects: TODO?
        url: homepage
        is_oa: boolean. If true, can assume all releases under this container are "Open Access"
        TODO: domains, if exclusive?
        TODO: fulltext_regex, if a known pattern?

    For KBART, etc:
        We "over-count" on the assumption that "in-progress" status works will soon actually be preserved.
        year and volume spans are run-length-encoded arrays, using integers:
            - if an integer, means that year is preserved
            - if an array of length 2, means everything between the two numbers (inclusive) is preserved
    """

    def __init__(self, api, **kwargs):
        """
        Set up the importer with editgroup defaults.

        Recognized kwargs:
            editgroup_description: overrides the default description string
            editgroup_extra: dict of extra editgroup metadata; an 'agent'
                key is filled in if the caller did not supply one
        """
        eg_desc = kwargs.get('editgroup_description',
            "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
        # Work on a copy so the 'agent' default is never written back into
        # the caller's dict; also tolerates an explicit editgroup_extra=None.
        eg_extra = dict(kwargs.get('editgroup_extra') or {})
        eg_extra.setdefault('agent', 'fatcat_tools.JournalMetadataImporter')
        super().__init__(api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra)

    def want(self, raw_record):
        """
        Returns True only for rows that carry a non-empty 'ISSN-L' value.
        """
        return bool(raw_record.get('ISSN-L'))

    def parse_record(self, row):
        """
        row is a python dict (parsed from CSV).
        returns a ContainerEntity (or None if invalid or couldn't parse)

        Rows with a missing/empty 'title' or 'ISSN-L' are rejected (None).
        The 'extra' dict currently populated here is a flat subset of the CSV
        columns: in_doaj, in_road, in_norwegian, language, url, ISSNp, ISSNe,
        is_oa, is_kept.
        """
        title = or_none(row['title'])
        issnl = or_none(row['ISSN-L'])
        if title is None or issnl is None:
            return None
        extra = dict(
            in_doaj=truthy(row['in_doaj']),
            in_road=truthy(row['in_road']),
            in_norwegian=truthy(row['in_norwegian']),
            language=or_none(row['lang']),
            url=or_none(row['url']),
            ISSNp=or_none(row['ISSN-print']),
            ISSNe=or_none(row['ISSN-electronic']),
            is_oa=truthy(row['is_oa']),
            is_kept=truthy(row['is_kept']),
        )
        ce = fatcat_client.ContainerEntity(
            issnl=issnl,
            name=clean(title),
            publisher=or_none(clean(row['publisher'])),
            extra=extra)
        return ce

    def try_update(self, ce):
        """
        Looks up an existing container by the ISSN-L of `ce`.

        Returns True when the lookup reports 404 or yields nothing (no such
        container yet), and False when a container already exists; in that
        case the 'exists' counter is incremented. Any ApiException other
        than a 404 is re-raised.
        """
        existing = None
        try:
            existing = self.api.lookup_container(issnl=ce.issnl)
        except fatcat_client.rest.ApiException as err:
            if err.status != 404:
                raise
            # no container with this ISSN-L yet, so this one is new
            return True

        # eventually we'll want to support "updates", but for now just skip if
        # entity already exists
        if existing:
            self.counts['exists'] += 1
            return False

        return True

    def insert_batch(self, batch):
        """
        Submits `batch` via the API's create_container_batch call, with
        autoaccept enabled and this importer's editgroup description and
        (JSON-encoded) editgroup extra metadata.
        """
        self.api.create_container_batch(batch,
            autoaccept=True,
            description=self.editgroup_description,
            extra=json.dumps(self.editgroup_extra))