diff options
Diffstat (limited to 'skate')
-rw-r--r-- | skate/schema.go | 102 |
1 file changed, 49 insertions, 53 deletions
diff --git a/skate/schema.go b/skate/schema.go index a4e7ef3..8be2f42 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -13,10 +13,13 @@ import ( ) var ( + // Various ISBN patterns tailored towards more dirty values, e.g. with + // artifacts from OCR, etc. isbn10Regex = regexp.MustCompile(`[O0-9xX -]{10,18}`) isbn13Regex = regexp.MustCompile(`9[O0-9xX -]{12,20}`) - // E.g. as found in editions, .publish_date. + // openLibraryDateLayouts, e.g. as found in Open Library Editions, + // .publish_date. openLibraryDateLayouts = []string{ "1 Jan 2006", "2006", @@ -38,6 +41,41 @@ var ( } ) +// Ref is a reference document (derived from "heavy intermediate schema"), can +// be very partial. +type Ref struct { + Biblio Biblio `json:"biblio"` + Index int64 `json:"index,omitempty"` + Key string `json:"key,omitempty"` + RefSource string `json:"ref_source,omitempty"` + ReleaseYear int `json:"release_year,omitempty"` + ReleaseIdent string `json:"release_ident,omitempty"` + ReleaseStage string `json:"release_stage,omitempty"` + WorkIdent string `json:"work_ident,omitempty"` +} + +// Biblio contains bibliographic information for a Ref. +type Biblio struct { + ArxivId string `json:"arxiv_id,omitempty"` + ContainerName string `json:"container_name,omitempty"` + ContribRawNames []string `json:"contrib_raw_names,omitempty"` + DOI string `json:"doi,omitempty"` + Issue string `json:"issue,omitempty"` + PMCID string `json:"pmcid,omitempty"` + PMID string `json:"pmid,omitempty"` + Pages string `json:"pages,omitempty"` + Publisher string `json:"publisher,omitempty"` + Title string `json:"title,omitempty"` + Unstructured string `json:"unstructured,omitempty"` + Url string `json:"url,omitempty"` + Volume string `json:"volume,omitempty"` + Year int64 `json:"year,omitempty"` + // More non-standard fields go into extra. + Extra struct { + ISBN []string `json:"isbn"` + } `json:"extra"` +} + // RefToRelease converts a ref to a release. 
func RefToRelease(ref *Ref) (*Release, error) { var ( @@ -89,6 +127,7 @@ func ParseIsbn(s string) []string { candidates13 = isbn13Regex.FindAllString(s, -1) u []rune z string + err error ) valid := setPool.Get().(set.Set) valid.Clear() @@ -100,7 +139,7 @@ func ParseIsbn(s string) []string { if c == 'O' { c = '0' } - if c >= '0' && c <= '9' || c == 'x' || c == 'X' { + if c >= '0' && c <= '9' || c == 'X' || c == 'x' { u = append(u, c) } } @@ -110,51 +149,15 @@ func ParseIsbn(s string) []string { continue } if len(z) < 12 { - w, err := isbn.To13(z) - if err != nil { + if z, err = isbn.To13(z); err != nil { continue } - valid.Add(w) - } else { - valid.Add(z) } + valid.Add(z) } return valid.Slice() } -type Biblio struct { - ArxivId string `json:"arxiv_id,omitempty"` - ContainerName string `json:"container_name,omitempty"` - ContribRawNames []string `json:"contrib_raw_names,omitempty"` - DOI string `json:"doi,omitempty"` - Issue string `json:"issue,omitempty"` - PMCID string `json:"pmcid,omitempty"` - PMID string `json:"pmid,omitempty"` - Pages string `json:"pages,omitempty"` - Publisher string `json:"publisher,omitempty"` - Title string `json:"title,omitempty"` - Unstructured string `json:"unstructured,omitempty"` - Url string `json:"url,omitempty"` - Volume string `json:"volume,omitempty"` - Year int64 `json:"year,omitempty"` - // More non-standard fields go into extra. - Extra struct { - ISBN []string `json:"isbn"` - } `json:"extra"` -} - -// Ref is a reference document, can be very partial. -type Ref struct { - Biblio Biblio `json:"biblio"` - Index int64 `json:"index,omitempty"` - Key string `json:"key,omitempty"` - RefSource string `json:"ref_source,omitempty"` - ReleaseYear int `json:"release_year,omitempty"` - ReleaseIdent string `json:"release_ident,omitempty"` - ReleaseStage string `json:"release_stage,omitempty"` - WorkIdent string `json:"work_ident,omitempty"` -} - // Release document. Note that we may have varying types for some fields. 
// Mitigation for now is to make the field an interface{}, name the field // "...Value" and to add a method with the field name, doing type assertion. @@ -439,7 +442,7 @@ type OpenLibraryWork struct { Version int64 `json:"_version_"` } -// OpenLibraryWorkToRelease convert OL data into a release. XXX: release/work? +// OpenLibraryWorkToRelease convert OL data into a release. func OpenLibraryWorkToRelease(w *OpenLibraryWork) (*Release, error) { var ( release Release @@ -476,6 +479,7 @@ func OpenLibraryWorkToRelease(w *OpenLibraryWork) (*Release, error) { return &release, nil } +// OpenLibraryEdition document, see: https://openlibrary.org/developers/dumps. type OpenLibraryEdition struct { Authors []struct { Key string `json:"key"` @@ -509,22 +513,12 @@ type OpenLibraryEdition struct { // null // { // "type": "/type/text", - // "value": "Includes index." - // } - // null - // { - // "type": "/type/text", // "value": "Includes bibliographical references (p. 137-143)." // } // null // "Includes bibliographical references (p. 203-205) and index." // null - // null Notes interface{} `json:"notes"` - // Notes struct { - // Type string `json:"type"` - // Value string `json:"value"` - // } `json:"notes"` NumberOfPages int64 `json:"number_of_pages"` Ocaid string `json:"ocaid"` @@ -547,7 +541,7 @@ type OpenLibraryEdition struct { } `json:"works"` } -// Isbns returns all found ISBN. Unique, non-normalized. +// Isbns returns all found ISBN: unique, sorted, non-normalized. func (v OpenLibraryEdition) Isbns() []string { s := set.New() for _, w := range v.Isbn10 { @@ -559,7 +553,9 @@ func (v OpenLibraryEdition) Isbns() []string { return s.Sorted() } -// OpenLibraryEditionToRelease convert OL data into a release. XXX: release/work? +// OpenLibraryEditionToRelease convert OL edition record into a release. Takes +// a (potentially empty) author map to replace author keys (like +// "/author/OL18273A") with names. 
func OpenLibraryEditionToRelease(v *OpenLibraryEdition, authorMap map[string]string) (*Release, error) { var ( release Release |