summaryrefslogtreecommitdiffstats
path: root/python/fatcat_web/entity_helpers.py
blob: b04be55c690f55208714f0ac48ad499675d1f331 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187

from flask import abort
from fatcat_openapi_client.rest import ApiException, ApiValueError
from fatcat_tools.transforms import *
from fatcat_web import app, api
from fatcat_web.search import get_elastic_container_stats, get_elastic_container_random_releases
from fatcat_web.hacks import strip_extlink_xml, wayback_suffix

def enrich_container_entity(entity):
    """
    Decorate a container entity with extra attributes used by the web views.

    Entities in the 'redirect' or 'deleted' state are returned untouched.
    Otherwise:

    - `_es` is set from container_to_elasticsearch() when the state is
      "active"
    - `_stats` is reset to None, then replaced by the result of
      get_elastic_container_stats()
    - `_random_releases` is set from get_elastic_container_random_releases()

    Any exception raised by the two search lookups is logged through
    app.log and otherwise ignored, so a failure there leaves `_stats` as
    None and may leave `_random_releases` unset.

    Returns the same entity object that was passed in.
    """
    if entity.state in ('redirect', 'deleted'):
        return entity

    if entity.state == "active":
        entity._es = container_to_elasticsearch(entity, force_bool=False)

    entity._stats = None
    try:
        container_stats = get_elastic_container_stats(entity.ident, issnl=entity.issnl)
        entity._stats = container_stats
        # random releases are fetched unconditionally (no check on the
        # stats totals)
        entity._random_releases = get_elastic_container_random_releases(entity.ident)
    except Exception as err:
        # best-effort: search lookups failing should not break the page
        app.log.error(err)
    return entity

def enrich_creator_entity(entity):
    """
    Decorate a creator entity with its list of releases.

    Entities in the 'redirect' or 'deleted' state are returned untouched.
    Otherwise `_releases` is set to None, and for 'active' or 'wip'
    entities it is then replaced by the result of
    api.get_creator_releases().

    Returns the same entity object that was passed in.
    """
    if entity.state in ('redirect', 'deleted'):
        return entity

    entity._releases = None
    if entity.state not in ('active', 'wip'):
        return entity

    entity._releases = api.get_creator_releases(entity.ident)
    return entity

def enrich_file_entity(entity):
    """
    Pass-through hook for file entities: no extra attributes are added.

    Returns the same entity object that was passed in.
    """
    return entity

def enrich_fileset_entity(entity):
    """
    Decorate a fileset entity with the combined size of its manifest.

    Entities in the 'redirect' or 'deleted' state are returned untouched.
    Otherwise `_total_size` is set to the sum of the `size` attribute of
    every manifest entry (0 for an empty manifest), or to None when the
    entity has no manifest at all.

    Returns the same entity object that was passed in.
    """
    if entity.state in ('redirect', 'deleted'):
        return entity
    entity._total_size = None
    # identity check: only a missing manifest (None) leaves the size unset;
    # an empty manifest still yields a total of 0
    if entity.manifest is not None:
        entity._total_size = sum(f.size for f in entity.manifest)
    return entity

def enrich_webcapture_entity(entity):
    """
    Decorate a webcapture entity with `_wayback_suffix`.

    Unless the entity is in the 'redirect' or 'deleted' state,
    `_wayback_suffix` is set to the result of wayback_suffix(entity).

    Returns the same entity object that was passed in.
    """
    if entity.state not in ('redirect', 'deleted'):
        entity._wayback_suffix = wayback_suffix(entity)
    return entity

def enrich_release_entity(entity):
    """
    Decorate a release entity (and its expanded sub-entities) with extra
    attributes used by the web views.

    Entities in the 'redirect' or 'deleted' state are returned untouched.
    Otherwise:

    - `_es` is set from release_to_elasticsearch() when the state is
      "active"; an "active" expanded container gets its own `_es` from
      container_to_elasticsearch()
    - files which have a 'shadows' entry in `extra` but no URLs are dropped
      from `entity.files`
    - every fileset gets `_total_size` (0 when it has no manifest) and every
      webcapture gets `_wayback_suffix`
    - the 'unstructured' extra field of each ref is passed through
      strip_extlink_xml()
    - `_authors` is the list of displayable author contribs sorted by index,
      and `_can_citeproc` is True only when there is at least one such
      author and a title
    - if the first abstract has a LaTeX mimetype the abstract list is
      reversed in place, and JATS paragraph tags are stripped from the
      content of the (then) first abstract

    Optional fields (`filesets`, `webcaptures`, `refs`, `contribs`, a
    fileset's `manifest`, an abstract's `mimetype` or `content`) may be
    None; they are treated as empty rather than raising.

    Returns the same entity object that was passed in.
    """
    if entity.state in ('redirect', 'deleted'):
        return entity
    if entity.state == "active":
        entity._es = release_to_elasticsearch(entity, force_bool=False)
    if entity.container and entity.container.state == "active":
        entity.container._es = container_to_elasticsearch(entity.container, force_bool=False)
    if entity.files:
        # remove shadows-only files with no URLs
        entity.files = [f for f in entity.files
            if not (f.extra and f.extra.get('shadows') and not f.urls)]
    for fs in entity.filesets or []:
        # a missing manifest counts as zero bytes, so the attribute is
        # always a number
        fs._total_size = sum(f.size for f in (fs.manifest or []))
    for wc in entity.webcaptures or []:
        wc._wayback_suffix = wayback_suffix(wc)
    for ref in entity.refs or []:
        # this is a UI hack to get rid of XML crud in unstructured refs like:
        # LOCKSS (2014) Available: <ext-link
        # xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri"
        # xlink:href="http://lockss.org/"
        # xlink:type="simple">http://lockss.org/</ext-link>. Accessed: 2014
        # November 1.
        if ref.extra and ref.extra.get('unstructured'):
            ref.extra['unstructured'] = strip_extlink_xml(ref.extra['unstructured'])
    # author list to display; ensure it's sorted by index (any authors with
    # index=None go to end of list)
    authors = [c for c in (entity.contribs or []) if
        c.role in ('author', None) and
        (c.surname or c.raw_name or (c.creator and c.creator.surname))
    ]
    entity._authors = sorted(
        authors,
        key=lambda c: c.index if c.index is not None else 99999999,
    )
    # need authors, title for citeproc to work
    entity._can_citeproc = bool(entity._authors) and bool(entity.title)
    if entity.abstracts:
        # hack to show plain text instead of latex abstracts
        if 'latex' in (entity.abstracts[0].mimetype or ''):
            entity.abstracts.reverse()
        # hack to (partially) clean up common JATS abstract display case
        first = entity.abstracts[0]
        if first.mimetype == 'application/xml+jats' and first.content:
            cleaned = first.content
            for tag in ('p', 'jats', 'jats:p'):
                # the '&lt;...&gt;' patterns cover double-encoded tags
                for pattern in ('<{}>', '</{}>', '&lt;/{}&gt;', '&lt;{}&gt;'):
                    cleaned = cleaned.replace(pattern.format(tag), '')
            first.content = cleaned
    return entity

def enrich_work_entity(entity):
    """
    Decorate a work entity with its list of releases.

    Entities in the 'redirect' or 'deleted' state are returned untouched.
    Otherwise `_releases` is set to None, and for 'active' or 'wip'
    entities it is then replaced by the result of api.get_work_releases().

    Returns the same entity object that was passed in.
    """
    if entity.state in ('redirect', 'deleted'):
        return entity

    entity._releases = None
    if entity.state not in ('active', 'wip'):
        return entity

    entity._releases = api.get_work_releases(entity.ident)
    return entity

def generic_get_entity(entity_type, ident):
    """
    Fetch an entity of the given type by identifier and run the matching
    enrich_*_entity() helper on it.

    entity_type is one of: 'container', 'creator', 'file', 'fileset',
    'webcapture', 'release', 'work'. Any other value raises
    NotImplementedError.

    An ApiException from the API client aborts the request with the
    exception's status code; an ApiValueError aborts with 400.
    """
    # each fetcher is lazy, so only the requested API call is ever made
    fetchers = {
        'container': lambda: enrich_container_entity(api.get_container(ident)),
        'creator': lambda: enrich_creator_entity(api.get_creator(ident)),
        'file': lambda: enrich_file_entity(api.get_file(ident, expand="releases")),
        'fileset': lambda: enrich_fileset_entity(api.get_fileset(ident, expand="releases")),
        'webcapture': lambda: enrich_webcapture_entity(api.get_webcapture(ident, expand="releases")),
        'release': lambda: enrich_release_entity(api.get_release(ident, expand="container,files,filesets,webcaptures")),
        'work': lambda: enrich_work_entity(api.get_work(ident)),
    }
    fetch = fetchers.get(entity_type)
    if fetch is None:
        raise NotImplementedError

    try:
        return fetch()
    except ApiException as ae:
        abort(ae.status)
    except ApiValueError:
        abort(400)

def generic_get_entity_revision(entity_type, revision_id):
    """
    Fetch a specific revision of an entity of the given type and run the
    matching enrich_*_entity() helper on it.

    entity_type is one of: 'container', 'creator', 'file', 'fileset',
    'webcapture', 'release', 'work'. Any other value raises
    NotImplementedError.

    An ApiException from the API client aborts the request with the
    exception's status code; an ApiValueError aborts with 400.
    """
    # each fetcher is lazy, so only the requested API call is ever made
    fetchers = {
        'container': lambda: enrich_container_entity(api.get_container_revision(revision_id)),
        'creator': lambda: enrich_creator_entity(api.get_creator_revision(revision_id)),
        'file': lambda: enrich_file_entity(api.get_file_revision(revision_id, expand="releases")),
        'fileset': lambda: enrich_fileset_entity(api.get_fileset_revision(revision_id, expand="releases")),
        'webcapture': lambda: enrich_webcapture_entity(api.get_webcapture_revision(revision_id, expand="releases")),
        'release': lambda: enrich_release_entity(api.get_release_revision(revision_id, expand="container")),
        'work': lambda: enrich_work_entity(api.get_work_revision(revision_id)),
    }
    fetch = fetchers.get(entity_type)
    if fetch is None:
        raise NotImplementedError

    try:
        return fetch()
    except ApiException as ae:
        abort(ae.status)
    except ApiValueError:
        abort(400)

def generic_get_editgroup_entity(editgroup, entity_type, ident):
    """
    Look up the edit for `ident` inside an editgroup and fetch the entity
    revision that edit points at.

    entity_type is one of: 'container', 'creator', 'file', 'fileset',
    'webcapture', 'release', 'work'. Any other value raises
    NotImplementedError.

    Aborts with 404 when the editgroup has no edit for `ident`, or when the
    first matching edit has no revision.

    Returns an (entity, edit) tuple; the `ident` of the returned entity is
    overwritten with the requested `ident`.
    """
    # name of the per-type edit list on `editgroup.edits`
    edit_list_names = {
        'container': 'containers',
        'creator': 'creators',
        'file': 'files',
        'fileset': 'filesets',
        'webcapture': 'webcaptures',
        'release': 'releases',
        'work': 'works',
    }
    list_name = edit_list_names.get(entity_type)
    if list_name is None:
        raise NotImplementedError
    edits = getattr(editgroup.edits, list_name)

    # only the first edit matching the identifier is considered
    edit = next((candidate for candidate in edits if candidate.ident == ident), None)
    revision_id = edit.revision if edit is not None else None
    if not revision_id:
        # couldn't find relevant edit in this editgroup
        abort(404)

    try:
        entity = generic_get_entity_revision(entity_type, revision_id)
    except ApiException as ae:
        abort(ae.status)
    except ApiValueError:
        abort(400)

    entity.ident = ident
    return entity, edit