import collections
from fatcat_client import ReleaseEntity, ApiClient

def entity_to_dict(entity):
    """
    Hack to take advantage of the code-generated serialization code
    """
    ac = ApiClient()
    return ac.sanitize_for_serialization(entity)

def entity_from_json(json_str, entity_type):
    """
    Hack to take advantage of the code-generated deserialization code
    """
    ac = ApiClient()
    # ApiClient.deserialize() expects a response-like object that exposes the
    # raw JSON body as a .data attribute, so fake one with a namedtuple instance
    thing = collections.namedtuple('Thing', ['data'])(data=json_str)
    return ac.deserialize(thing, entity_type)
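
# Example round-trip through the two helpers above (a sketch, not part of the
# original module; assumes the generated ReleaseEntity accepts field kwargs):
#
#   import json
#   release = ReleaseEntity(title="An Example Paper")
#   as_dict = entity_to_dict(release)
#   same_release = entity_from_json(json.dumps(as_dict), ReleaseEntity)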

def release_to_elasticsearch(release):
    """
    Converts from an entity model/schema to an Elasticsearch-oriented schema.

    Returns: dict
    Raises exception on error (never returns None)
    """

    if release.state in ('redirect', 'deleted'):
        return dict(
            ident = release.ident,
            state = release.state,
        )
    elif release.state != 'active':
        raise ValueError("Unhandled release state: {}".format(release.state))

    # First, the easy ones (direct copy)
    t = dict(
        ident = release.ident,
        state = release.state,
        revision = release.revision,
        title = release.title,
        original_title = release.original_title,
        release_type = release.release_type,
        release_status = release.release_status,
        language = release.language,
        license = release.license_slug,
        doi = release.doi,
        pmid = release.pmid,
        pmcid = release.pmcid,
        isbn13 = release.isbn13,
        wikidata_qid = release.wikidata_qid,
        core_id = release.core_id,
        arxiv_id = release.arxiv_id,
        jstor_id = release.jstor_id,
    )

    # Access/preservation flags, refined below from container metadata, file
    # URLs, and release 'extra' metadata; None means "unknown"
    is_oa = None
    is_longtail_oa = None
    in_kbart = None
    is_preserved = None
    in_web = False
    in_dweb = False
    in_ia = False
    in_shadow = False

    if release.release_date:
        # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
        t['release_date'] = release.release_date.isoformat()
        if release.release_year is None:
            t['release_year'] = release.release_date.year
    if release.release_year is not None:
        t['release_year'] = release.release_year

    t['any_abstract'] = len(release.abstracts or []) > 0
    t['ref_count'] = len(release.refs or [])
    t['contrib_count'] = len(release.contribs or [])
    contrib_names = []
    for c in (release.contribs or []):
        if c.raw_name:
            contrib_names.append(c.raw_name)
    t['contrib_names'] = contrib_names

    container = release.container
    if container:
        t['publisher'] = container.publisher
        t['container_name'] = container.name
        t['container_issnl'] = container.issnl
        t['container_type'] = container.container_type
        if container.extra:
            if container.extra.get('is_oa') or container.extra.get('in_doaj'):
                is_oa = True
            if container.extra.get('in_kbart'):
                # TODO: better KBART check goes here
                in_kbart = True
            if container.extra.get('ia'):
                # TODO: container longtail check goes here
                # TODO: sim/microfilm check goes here
                pass
            # TODO: SHERPA/Romeo goes here
    else:
        t['publisher'] = release.publisher

    files = release.files or []
    t['file_count'] = len(files)
    t['fileset_count'] = len(release.filesets or [])
    t['webcapture_count'] = len(release.webcaptures or [])
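    # PDF URL candidates, in increasing order of preference: any PDF URL at
    # all, then one from a webarchive or repository, then one on archive.org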
    any_pdf_url = None
    good_pdf_url = None
    best_pdf_url = None
    ia_pdf_url = None
    for f in files:
        if f.extra and f.extra.get('shadows'):
            # TODO: shadow check goes here
            in_shadow = True
        is_pdf = 'pdf' in (f.mimetype or '')
        for url in (f.urls or []):
            if url.url.lower().startswith('http'):
                in_web = True
            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
                # TODO: not sure what rel will be
                in_dweb = True
            if is_pdf:
                any_pdf_url = url.url
            if is_pdf and url.rel in ('webarchive', 'repository'):
                is_preserved = True
                good_pdf_url = url.url
            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
                in_ia = True
                if is_pdf:
                    best_pdf_url = url.url
                    ia_pdf_url = url.url
    # here is where we bake-in priority; IA-specific
    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
    t['ia_pdf_url'] = ia_pdf_url

    if release.license_slug:
        # TODO: more/better checks here, particularly strict *not* OA licenses
        if release.license_slug.startswith("CC-"):
            is_oa = True

    extra = release.extra or dict()
    if extra:
        # TODO: longtail OA check from GROBID here
        if extra.get('in_kbart'):
            # NOTE: not actually setting this anywhere
            in_kbart = True
        if extra.get('is_oa'):
            # NOTE: not actually setting this anywhere
            is_oa = True
        if extra.get('grobid'):
            if not t.get('container_name'):
                t['container_name'] = extra['grobid'].get('container_name')
            if extra['grobid'].get('longtail_oa'):
                is_longtail_oa = True
        if extra.get('crossref'):
            if extra['crossref'].get('archive'):
                # all crossref archives are KBART, I believe
                in_kbart = True

    if is_longtail_oa:
        is_oa = True
    t['is_oa'] = is_oa
    t['is_longtail_oa'] = is_longtail_oa
    t['in_kbart'] = in_kbart
    t['in_web'] = in_web
    t['in_dweb'] = in_dweb
    t['in_ia'] = in_ia
    t['is_preserved'] = bool(is_preserved or in_ia or in_kbart)
    return t
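
# Example usage (a sketch, not part of the original module; the Elasticsearch
# client, index name, ident, and release lookup are all hypothetical):
#
#   from elasticsearch import Elasticsearch
#   import fatcat_client
#
#   api = fatcat_client.DefaultApi()
#   release = api.get_release("aaaaaaaaaaaaarceaaaaaaaaam", expand="container,files")
#   doc = release_to_elasticsearch(release)
#   es = Elasticsearch("http://localhost:9200")
#   es.index(index="fatcat_release", id=doc['ident'], body=doc)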