diff options
-rw-r--r-- | skate/cmd/skate-conv/main.go | 6 | ||||
-rw-r--r-- | skate/schema.go | 30 |
2 files changed, 22 insertions, 14 deletions
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go index 7cbc9bb..334da81 100644 --- a/skate/cmd/skate-conv/main.go +++ b/skate/cmd/skate-conv/main.go @@ -93,7 +93,7 @@ func refToRelease(p []byte) ([]byte, error) { // https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst. func rgSitemapToRelease(p []byte) ([]byte, error) { var ( - s skate.Sitemap + s skate.SitemapEntry release skate.Release err error ) @@ -118,7 +118,7 @@ func rgSitemapToRelease(p []byte) ([]byte, error) { // openLibraryToRelease converts an Open Library work item to a release. func openLibraryToRelease(p []byte) ([]byte, error) { var ( - w skate.OpenLibraryWork + w skate.OpenLibrarySolrDoc release *skate.Release err error ) @@ -129,7 +129,7 @@ func openLibraryToRelease(p []byte) ([]byte, error) { return nil, err } } - if release, err = skate.OpenLibraryWorkToRelease(&w); err != nil { + if release, err = skate.OpenLibrarySolrDocToRelease(&w); err != nil { return nil, err } release.Extra.Skate.Status = "ol" diff --git a/skate/schema.go b/skate/schema.go index 8be2f42..118e124 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -158,10 +158,13 @@ func ParseIsbn(s string) []string { return valid.Slice() } -// Release document. Note that we may have varying types for some fields. -// Mitigation for now is to make the field an interface{}, name the field -// "...Value" and to add a method with the field name, doing type assertion. -// Example: ReleaseYearValue interface{}, ReleaseYear() int, etc. +// Release document. Note that we may have different types for some fields +// (e.g. string, int, etc.). Mitigation for now is to make the field an +// interface{}, name the field "...Value" and to add a method with the field +// name, doing type assertion. Example: ReleaseYearValue interface{}, +// ReleaseYear() int, etc. +// +// Extra field gets a section for "skate" for conversion related values. type Release struct { ContainerID string `json:"container_id,omitempty"` ContainerName string `json:"container_name,omitempty"` @@ -293,14 +296,15 @@ func (r *DataCiteRelation) RelatedIdentifier() string { } } -// Sitemap basic JSON style, e.g. for https://archive.org/details/rg_sitemap_2021_02_23. -type Sitemap struct { +// SitemapEntry in a basic JSON style, e.g. for https://archive.org/details/rg_sitemap_2021_02_23. +type SitemapEntry struct { Lastmod string `json:"lastmod,omitempty"` Title string `json:"title,omitempty"` URL string `json:"url,omitempty"` } -// BiblioRef as a prototype for indexing, https://is.gd/yicTom. +// BiblioRef as a prototype for citation graph elasticsearch indexing, +// https://is.gd/yicTom. type BiblioRef struct { Key string `json:"_id,omitempty"` IndexedTs string `json:"indexed_ts,omitempty"` // https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html @@ -326,6 +330,9 @@ type BiblioRef struct { // ReleaseCluster, a list of match candidates. This is typically serialized as a // single JSON line containing the match key and a list of release documents. +// +// Deprecated, since we are moving to a "two stream" generic "join" style +// processing. type ReleaseCluster struct { Key string `json:"k"` Values []*Release `json:"v"` @@ -412,8 +419,9 @@ func (c *MinimalCitations) ParseIDList() (result IDList) { return result } -// OpenLibraryWork from data dump (solr). -type OpenLibraryWork struct { +// OpenLibrarySolrDoc from data dump (solr). Note: we most likely only need OL +// editions. +type OpenLibrarySolrDoc struct { AuthorFacet []string `json:"author_facet"` AuthorKey []string `json:"author_key"` AuthorName []string `json:"author_name"` @@ -442,8 +450,8 @@ type OpenLibraryWork struct { Version int64 `json:"_version_"` } -// OpenLibraryWorkToRelease convert OL data into a release. -func OpenLibraryWorkToRelease(w *OpenLibraryWork) (*Release, error) { +// OpenLibrarySolrDocToRelease convert OL data into a release. +func OpenLibrarySolrDocToRelease(w *OpenLibrarySolrDoc) (*Release, error) { var ( release Release contribs = make([]struct { |