import collections
from fatcat_client import ReleaseEntity, ApiClient


def entity_to_dict(entity):
    """
    Hack to take advantage of the code-generated serialization code
    """
    ac = ApiClient()
    return ac.sanitize_for_serialization(entity)

def entity_from_json(json_str, entity_type):
    """
    Hack to take advantage of the code-generated deserialization code
    """
    ac = ApiClient()
    # ApiClient.deserialize() expects a response-like object with a .data
    # attribute holding the JSON body; fake one with a namedtuple
    thing = collections.namedtuple('Thing', ['data'])
    thing.data = json_str
    return ac.deserialize(thing, entity_type)
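
# A minimal round-trip sketch of the two helpers above. The field values are
# hypothetical, and this assumes the code-generated ReleaseEntity model
# accepts these keyword arguments; it is an illustration, not part of the
# module:
#
#   import json
#   release = ReleaseEntity(title="Example Paper", release_type="article-journal")
#   json_str = json.dumps(entity_to_dict(release))
#   roundtrip = entity_from_json(json_str, ReleaseEntity)
#   assert roundtrip.title == release.title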

def release_to_elasticsearch(release):
    """
    Converts from an entity model/schema to elasticsearch oriented schema.

    Returns: dict
    Raises exception on error (never returns None)
    """
    if release.state in ('redirect', 'deleted'):
        return dict(
            ident = release.ident,
            state = release.state,
        )
    elif release.state != 'active':
        raise ValueError("Unhandled release state: {}".format(release.state))

    # First, the easy ones (direct copy)
    t = dict(
        ident = release.ident,
        state = release.state,
        revision = release.revision,
        title = release.title,
        original_title = release.original_title,
        release_type = release.release_type,
        release_status = release.release_status,
        language = release.language,
        license = release.license_slug,
        doi = release.doi,
        pmid = release.pmid,
        pmcid = release.pmcid,
        isbn13 = release.isbn13,
        wikidata_qid = release.wikidata_qid,
        core_id = release.core_id,
        arxiv_id = release.arxiv_id,
        jstor_id = release.jstor_id,
    )

    is_oa = None
    is_longtail_oa = None
    is_preserved = None
    in_kbart = None
    in_web = False
    in_dweb = False
    in_ia = False
    in_shadow = False

    if release.release_date:
        # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
        t['release_date'] = release.release_date.isoformat()
        if release.release_year is None:
            # fall back to the year of the full release date
            t['release_year'] = release.release_date.year
    if release.release_year is not None:
        t['release_year'] = release.release_year

    t['any_abstract'] = len(release.abstracts or []) > 0
    t['ref_count'] = len(release.refs or [])
    t['contrib_count'] = len(release.contribs or [])
    contrib_names = []
    for c in (release.contribs or []):
        if c.raw_name:
            contrib_names.append(c.raw_name)
    t['contrib_names'] = contrib_names

    container = release.container
    if container:
        t['publisher'] = container.publisher
        t['container_name'] = container.name
        t['container_issnl'] = container.issnl
        t['container_type'] = container.container_type
        if container.extra:
            if container.extra.get('is_oa') or container.extra.get('in_doaj'):
                is_oa = True
            if container.extra.get('in_kbart'):
                # TODO: better KBART check goes here
                in_kbart = True
            if container.extra.get('ia'):
                # TODO: container longtail check goes here
                # TODO: sim/microfilm check goes here
                pass
        # TODO: SHERPA/Romeo goes here
    else:
        t['publisher'] = release.publisher

    files = release.files or []
    t['file_count'] = len(files)
    t['fileset_count'] = len(release.filesets or [])
    t['webcapture_count'] = len(release.webcaptures or [])
    any_pdf_url = None
    good_pdf_url = None
    best_pdf_url = None
    ia_pdf_url = None
    for f in files:
        if f.extra and f.extra.get('shadows'):
            # TODO: shadow check goes here
            in_shadow = True
        is_pdf = 'pdf' in (f.mimetype or '')
        for url in (f.urls or []):
            if url.url.lower().startswith('http'):
                in_web = True
            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
                # TODO: not sure what rel will be
                in_dweb = True
            if is_pdf:
                any_pdf_url = url.url
            if is_pdf and url.rel in ('webarchive', 'repository'):
                is_preserved = True
                good_pdf_url = url.url
            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
                in_ia = True
                if is_pdf:
                    best_pdf_url = url.url
                    ia_pdf_url = url.url
    # here is where we bake-in priority; IA-specific
    # (IA-hosted PDF > webarchive/repository PDF > any PDF)
    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
    t['ia_pdf_url'] = ia_pdf_url

    if release.license_slug:
        # TODO: more/better checks here, particularly strict *not* OA licenses
        if release.license_slug.startswith("CC-"):
            is_oa = True

    extra = release.extra or dict()
    if extra:
        # TODO: longtail OA check from GROBID here
        if extra.get('in_kbart'):
            # NOTE: not actually setting this anywhere
            in_kbart = True
        if extra.get('is_oa'):
            # NOTE: not actually setting this anywhere
            is_oa = True
        if extra.get('grobid'):
            if not t.get('container_name'):
                t['container_name'] = extra['grobid'].get('container_name')
            if extra['grobid'].get('longtail_oa'):
                is_longtail_oa = True
        if extra.get('crossref'):
            if extra['crossref'].get('archive'):
                # all crossref archives are KBART, I believe
                in_kbart = True

    if is_longtail_oa:
        is_oa = True
    t['is_oa'] = is_oa
    t['is_longtail_oa'] = is_longtail_oa
    t['in_kbart'] = in_kbart
    t['in_web'] = in_web
    t['in_dweb'] = in_dweb
    t['in_ia'] = in_ia
    # include the per-file flag set in the URL loop above, not just IA/KBART
    t['is_preserved'] = is_preserved or in_ia or in_kbart
    return t
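
# A minimal usage sketch. The release ident is hypothetical, and the exact
# client setup (DefaultApi, get_release(), the expand parameter) is an
# assumption about the code-generated fatcat_client API, not something this
# module defines:
#
#   import json
#   import fatcat_client
#   api = fatcat_client.DefaultApi(fatcat_client.ApiClient())
#   release = api.get_release('aaaaaaaaaaaaarceaaaaaaaaam', expand="container,files")
#   doc = release_to_elasticsearch(release)
#   print(json.dumps(doc))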