1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
|
package skate
import (
"fmt"
"strconv"
)
// RefToRelease converts a ref to a release. The result carries
// extra.skate.status set to "ref", so that converted entities can be told
// apart from genuine releases later (see ClusterResult.OneNonRef).
//
// Identifiers, title, publisher, container name, volume, issue and pages are
// copied over as is. The release year is only carried over when it is greater
// than 1000 and is stored as a string. Each raw contributor name becomes one
// contrib entry whose index is its position in the ref.
//
// An error is returned only for a nil ref.
func RefToRelease(ref *Ref) (*Release, error) {
	if ref == nil {
		return nil, fmt.Errorf("cannot convert nil ref to release")
	}
	var (
		release Release
		b       = ref.Biblio
		// The element type has to match Release.Contribs exactly (including
		// the struct tags), otherwise the assignment below does not compile.
		contribs = make([]struct {
			Index   int    `json:"index,omitempty"`
			RawName string `json:"raw_name,omitempty"`
			Role    string `json:"role,omitempty"`
		}, len(b.ContribRawNames))
	)
	release.Ident = ref.ReleaseIdent
	release.WorkID = ref.WorkIdent
	release.ExtIDs.Arxiv = b.ArxivId
	release.ExtIDs.DOI = b.DOI
	release.ExtIDs.PMID = b.PMID
	release.ExtIDs.PMCID = b.PMCID
	release.Title = b.Title
	release.Publisher = b.Publisher
	release.ContainerName = b.ContainerName
	release.Volume = b.Volume
	release.Issue = b.Issue
	release.Pages = b.Pages
	// Years of 1000 and below are dropped rather than carried over.
	if ref.ReleaseYear > 1000 {
		release.ReleaseYearValue = fmt.Sprintf("%d", ref.ReleaseYear)
	}
	for i, name := range b.ContribRawNames {
		contribs[i].Index = i
		contribs[i].RawName = name
	}
	release.Contribs = contribs
	// Mark the document as converted; this is the flag the doc comment
	// promises and that OneNonRef filters on.
	release.Extra.Skate.Status = "ref"
	return &release, nil
}
// Ref is a reference document, can be very partial. Apart from "biblio",
// every JSON key carries omitempty and is dropped from the output when the
// field holds its zero value.
type Ref struct {
// Biblio groups the bibliographic fields of the reference; RefToRelease
// copies most of them into a Release.
Biblio struct {
ArxivId string `json:"arxiv_id,omitempty"`
ContainerName string `json:"container_name,omitempty"`
// ContribRawNames holds contributor names as plain strings, one per entry.
ContribRawNames []string `json:"contrib_raw_names,omitempty"`
DOI string `json:"doi,omitempty"`
Issue string `json:"issue,omitempty"`
PMCID string `json:"pmcid,omitempty"`
PMID string `json:"pmid,omitempty"`
Pages string `json:"pages,omitempty"`
Publisher string `json:"publisher,omitempty"`
Title string `json:"title,omitempty"`
// Unstructured is presumably the raw citation string -- TODO confirm.
Unstructured string `json:"unstructured,omitempty"`
Url string `json:"url,omitempty"`
Volume string `json:"volume,omitempty"`
// Year is an int64 here, whereas ReleaseYear below is a plain int.
Year int64 `json:"year,omitempty"`
} `json:"biblio"`
Index int64 `json:"index,omitempty"`
Key string `json:"key,omitempty"`
RefSource string `json:"ref_source,omitempty"`
// ReleaseYear is only carried over by RefToRelease when it is greater than
// 1000.
ReleaseYear int `json:"release_year,omitempty"`
// ReleaseIdent and WorkIdent become Release.Ident and Release.WorkID in
// RefToRelease.
ReleaseIdent string `json:"release_ident,omitempty"`
ReleaseStage string `json:"release_stage,omitempty"`
WorkIdent string `json:"work_ident,omitempty"`
}
// Release document. Note that we may have varying types for some fields.
// Mitigation for now is to make the field an interface{}, name the field
// "...Value" and to add a method with the field name, doing type assertion.
// Example: ReleaseYearValue interface{}, ReleaseYear() int, etc.
//
// NOTE: encoding/json never considers a struct value empty, so the omitempty
// option on the struct-typed fields below (ext_ids, extra, ...) does not
// suppress them on output.
type Release struct {
ContainerID string `json:"container_id,omitempty"`
ContainerName string `json:"container_name,omitempty"`
// Contribs lists the contributors. RefToRelease builds a slice of this exact
// anonymous struct type, so the field set and tags must stay in sync with it.
Contribs []struct {
Index int `json:"index,omitempty"`
RawName string `json:"raw_name,omitempty"`
Role string `json:"role,omitempty"`
} `json:"contribs,omitempty"`
// ExtIDs groups the external identifiers known for the release.
ExtIDs struct {
DOI string `json:"doi,omitempty"`
PMID string `json:"pmid,omitempty"`
PMCID string `json:"pmcid,omitempty"`
Arxiv string `json:"arxiv,omitempty"`
Core string `json:"core,omitempty"`
WikidataQID string `json:"wikidata_qid,omitempty"`
Jstor string `json:"jstor,omitempty"`
} `json:"ext_ids,omitempty"`
Ident string `json:"ident,omitempty"`
Publisher string `json:"publisher,omitempty"`
// Refs holds the reference entries embedded in the release document.
Refs []struct {
ContainerName string `json:"container_name,omitempty"`
// Extra carries additional per-reference fields; note that Year is a string
// here while the sibling Year field below is an int64.
Extra struct {
DOI string `json:"doi,omitempty"`
Authors []string `json:"authors,omitempty"`
Key string `json:"key,omitempty"`
Year string `json:"year,omitempty"`
Locator string `json:"locator,omitempty"`
Volume string `json:"volume,omitempty"`
} `json:"extra"`
Index int64 `json:"index,omitempty"`
Key string `json:"key,omitempty"`
Locator string `json:"locator,omitempty"`
Year int64 `json:"year,omitempty"`
} `json:"refs,omitempty"`
ReleaseDate string `json:"release_date,omitempty"`
// ReleaseYearValue should be read through ReleaseYear or ReleaseYearString.
ReleaseYearValue interface{} `json:"release_year,omitempty"` // might be int or str
ReleaseStage string `json:"release_stage,omitempty"`
ReleaseType string `json:"release_type,omitempty"`
Issue string `json:"issue,omitempty"`
Volume string `json:"volume,omitempty"`
Pages string `json:"pages,omitempty"`
Title string `json:"title,omitempty"`
WorkID string `json:"work_id,omitempty"`
// Extra holds additional, partly source-specific metadata.
Extra struct {
ContainerName string `json:"container_name,omitempty"`
// SubtitleValue should be read through Subtitle, which normalizes it to a
// slice of strings.
SubtitleValue interface{} `json:"subtitle,omitempty"` // []str or str
Crossref struct {
Type string `json:"type,omitempty"`
} `json:"crossref,omitempty"`
DataCite struct {
MetadataVersion int `json:"metadataVersion,omitempty"`
Relations []DataCiteRelation `json:"relations,omitempty"`
} `json:"datacite,omitempty"`
// Skate holds fields that are specific to this package.
Skate struct {
// Mark as converted (e.g. by setting status to "ref")
Status string `json:"status,omitempty"`
// Carry the ref index and key around.
Ref struct {
Index int64 `json:"index,omitempty"`
Key string `json:"key,omitempty"`
Locator string `json:"locator,omitempty"`
} `json:"ref,omitempty"`
// ResearchGate is serialized under the short key "rg".
ResearchGate struct {
URL string `json:"url,omitempty"`
} `json:"rg,omitempty"`
} `json:"skate,omitempty"`
} `json:"extra,omitempty"`
}
// Subtitle normalizes the value stored in Extra.SubtitleValue to a slice of
// strings. A single string becomes a one element slice, a []string is handed
// back unchanged (not copied), and the elements of a []interface{} are
// formatted with %v. Any other type, including a missing value, results in an
// empty, non-nil slice.
func (r *Release) Subtitle() []string {
	switch sub := r.Extra.SubtitleValue.(type) {
	case string:
		return []string{sub}
	case []string:
		return sub
	case []interface{}:
		// Deliberately starts out nil, so an empty list yields a nil slice.
		var formatted []string
		for i := range sub {
			formatted = append(formatted, fmt.Sprintf("%v", sub[i]))
		}
		return formatted
	default:
		return []string{}
	}
}
// ReleaseYearString formats the result of ReleaseYear as a decimal string.
// Since ReleaseYear reports 0 for missing or unparsable values, the result is
// "0" in those cases.
func (r *Release) ReleaseYearString() string {
	year := r.ReleaseYear()
	return fmt.Sprintf("%d", year)
}
// ReleaseYear reports the release year as an int, without checking whether
// the number is a plausible year. ReleaseYearValue may hold an int, a float64
// (truncated toward zero) or a decimal string; a string that does not parse,
// a missing value or any other type yields 0.
func (r *Release) ReleaseYear() int {
	switch year := r.ReleaseYearValue.(type) {
	case string:
		if n, err := strconv.Atoi(year); err == nil {
			return n
		}
	case float64:
		return int(year)
	case int:
		return year
	}
	return 0
}
// DataCiteRelation as it appears in the release extra field.
type DataCiteRelation struct {
RelatedIdentifierType string `json:"relatedIdentifierType,omitempty"`
// RelatedIdentifierValue is untyped, so it is not necessarily a string; use
// RelatedIdentifier to obtain it as a string.
RelatedIdentifierValue interface{} `json:"relatedIdentifier,omitempty"`
}
// RelatedIdentifier reports the related identifier as a string. A string
// value is returned verbatim, anything else (including a missing value) is
// rendered with the %v verb.
func (r *DataCiteRelation) RelatedIdentifier() string {
	if s, ok := r.RelatedIdentifierValue.(string); ok {
		return s
	}
	return fmt.Sprintf("%v", r.RelatedIdentifierValue)
}
// Sitemap basic JSON style, e.g. for https://archive.org/details/rg_sitemap_2021_02_23.
// All three fields are plain strings and are omitted from the JSON output
// when empty.
type Sitemap struct {
// Lastmod is kept as a string; no date format is enforced here.
Lastmod string `json:"lastmod,omitempty"`
Title string `json:"title,omitempty"`
URL string `json:"url,omitempty"`
}
// BiblioRef as a prototype for indexing, https://is.gd/yicTom. The fields
// fall into groups by prefix: Source*, Ref*, Target* and Match*. Every field
// is omitted from the JSON output when it holds its zero value.
type BiblioRef struct {
// Key is serialized as "_id".
Key string `json:"_id,omitempty"`
UpdateTs int64 `json:"update_ts,omitempty"` // XXX: maybe: epoch_millis, https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html
// Source* fields presumably describe the citing document -- TODO confirm.
SourceReleaseIdent string `json:"source_release_ident,omitempty"`
SourceWorkIdent string `json:"source_work_ident,omitempty"`
SourceWikipediaArticle string `json:"source_wikipedia_article,omitempty"`
SourceReleaseStage string `json:"source_release_stage,omitempty"`
// SourceYear is a string, unlike the numeric year fields on Ref.
SourceYear string `json:"source_year,omitempty"`
// Ref* fields mirror the index, key and locator carried by a reference.
RefIndex int64 `json:"ref_index,omitempty"`
RefKey string `json:"ref_key,omitempty"`
RefLocator string `json:"ref_locator,omitempty"`
// Target* fields presumably describe the cited document -- TODO confirm.
TargetReleaseIdent string `json:"target_release_ident,omitempty"`
TargetWorkIdent string `json:"target_work_ident,omitempty"`
TargetOpenLibraryWork string `json:"target_openlibrary_work,omitempty"`
TargetURLSurt string `json:"target_url_surt,omitempty"`
TargetURL string `json:"target_url,omitempty"`
// Match* fields record how and why source and target were linked; the set
// of valid values is not defined in this file.
MatchProvenance string `json:"match_provenance,omitempty"`
MatchStatus string `json:"match_status,omitempty"`
MatchReason string `json:"match_reason,omitempty"`
TargetUnstructured string `json:"target_unstructured,omitempty"`
TargetCSL string `json:"target_csl,omitempty"`
}
// ClusterResult is a set of release documents grouped under a common key.
// The JSON encoding uses the short keys "k" and "v", neither of which is
// omitted when empty.
type ClusterResult struct {
Key string `json:"k"`
// Values holds pointers to the releases in the cluster.
Values []*Release `json:"v"`
}
// OneNonRef returns the first release in the cluster that is not marked as a
// converted reference, i.e. whose extra.skate.status is not "ref". Nil
// entries in Values are skipped. An error is returned if the cluster contains
// no such release.
func (cr *ClusterResult) OneNonRef() (*Release, error) {
	for _, re := range cr.Values {
		// A JSON null in the values list decodes to a nil pointer; skip it
		// instead of dereferencing it.
		if re == nil {
			continue
		}
		if re.Extra.Skate.Status != "ref" {
			return re, nil
		}
	}
	return nil, fmt.Errorf("no reference/release found")
}
// Group is a cluster with explicit groups (e.g. store the json lines in A, B).
// The fields carry no JSON tags.
type Group struct {
Key string
// A and B are the two sides of the group, each a list of strings.
A []string
B []string
}
// String renders a short summary of the group that shows how many entries A
// and B hold; the key is not included.
func (g *Group) String() string {
	sizeA, sizeB := len(g.A), len(g.B)
	return fmt.Sprintf("<Group A/B %d/%d>", sizeA, sizeB)
}
|