aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--skate/schema.go102
1 files changed, 49 insertions, 53 deletions
diff --git a/skate/schema.go b/skate/schema.go
index a4e7ef3..8be2f42 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -13,10 +13,13 @@ import (
)
var (
+ // ISBN patterns that deliberately tolerate dirty values, e.g. OCR
+ // artifacts such as the letter O in place of the digit zero.
isbn10Regex = regexp.MustCompile(`[O0-9xX -]{10,18}`)
isbn13Regex = regexp.MustCompile(`9[O0-9xX -]{12,20}`)
- // E.g. as found in editions, .publish_date.
+ // openLibraryDateLayouts lists date layouts found in Open Library
+ // Editions, e.g. in the .publish_date field.
openLibraryDateLayouts = []string{
"1 Jan 2006",
"2006",
@@ -38,6 +41,41 @@ var (
}
)
+// Ref is a reference document (derived from the "heavy intermediate
+// schema"); it can be very partial.
+type Ref struct {
+ Biblio Biblio `json:"biblio"`
+ Index int64 `json:"index,omitempty"`
+ Key string `json:"key,omitempty"`
+ RefSource string `json:"ref_source,omitempty"`
+ ReleaseYear int `json:"release_year,omitempty"`
+ ReleaseIdent string `json:"release_ident,omitempty"`
+ ReleaseStage string `json:"release_stage,omitempty"`
+ WorkIdent string `json:"work_ident,omitempty"`
+}
+
+// Biblio contains bibliographic information for a Ref.
+type Biblio struct {
+ ArxivId string `json:"arxiv_id,omitempty"`
+ ContainerName string `json:"container_name,omitempty"`
+ ContribRawNames []string `json:"contrib_raw_names,omitempty"`
+ DOI string `json:"doi,omitempty"`
+ Issue string `json:"issue,omitempty"`
+ PMCID string `json:"pmcid,omitempty"`
+ PMID string `json:"pmid,omitempty"`
+ Pages string `json:"pages,omitempty"`
+ Publisher string `json:"publisher,omitempty"`
+ Title string `json:"title,omitempty"`
+ Unstructured string `json:"unstructured,omitempty"`
+ Url string `json:"url,omitempty"`
+ Volume string `json:"volume,omitempty"`
+ Year int64 `json:"year,omitempty"`
+ // More non-standard fields go into extra.
+ Extra struct {
+ ISBN []string `json:"isbn"`
+ } `json:"extra"`
+}
+
// RefToRelease converts a ref to a release.
func RefToRelease(ref *Ref) (*Release, error) {
var (
@@ -89,6 +127,7 @@ func ParseIsbn(s string) []string {
candidates13 = isbn13Regex.FindAllString(s, -1)
u []rune
z string
+ err error
)
valid := setPool.Get().(set.Set)
valid.Clear()
@@ -100,7 +139,7 @@ func ParseIsbn(s string) []string {
if c == 'O' {
c = '0'
}
- if c >= '0' && c <= '9' || c == 'x' || c == 'X' {
+ if c >= '0' && c <= '9' || c == 'X' || c == 'x' {
u = append(u, c)
}
}
@@ -110,51 +149,15 @@ func ParseIsbn(s string) []string {
continue
}
if len(z) < 12 {
- w, err := isbn.To13(z)
- if err != nil {
+ if z, err = isbn.To13(z); err != nil {
continue
}
- valid.Add(w)
- } else {
- valid.Add(z)
}
+ valid.Add(z)
}
return valid.Slice()
}
-type Biblio struct {
- ArxivId string `json:"arxiv_id,omitempty"`
- ContainerName string `json:"container_name,omitempty"`
- ContribRawNames []string `json:"contrib_raw_names,omitempty"`
- DOI string `json:"doi,omitempty"`
- Issue string `json:"issue,omitempty"`
- PMCID string `json:"pmcid,omitempty"`
- PMID string `json:"pmid,omitempty"`
- Pages string `json:"pages,omitempty"`
- Publisher string `json:"publisher,omitempty"`
- Title string `json:"title,omitempty"`
- Unstructured string `json:"unstructured,omitempty"`
- Url string `json:"url,omitempty"`
- Volume string `json:"volume,omitempty"`
- Year int64 `json:"year,omitempty"`
- // More non-standard fields go into extra.
- Extra struct {
- ISBN []string `json:"isbn"`
- } `json:"extra"`
-}
-
-// Ref is a reference document, can be very partial.
-type Ref struct {
- Biblio Biblio `json:"biblio"`
- Index int64 `json:"index,omitempty"`
- Key string `json:"key,omitempty"`
- RefSource string `json:"ref_source,omitempty"`
- ReleaseYear int `json:"release_year,omitempty"`
- ReleaseIdent string `json:"release_ident,omitempty"`
- ReleaseStage string `json:"release_stage,omitempty"`
- WorkIdent string `json:"work_ident,omitempty"`
-}
-
// Release document. Note that we may have varying types for some fields.
// Mitigation for now is to make the field an interface{}, name the field
// "...Value" and to add a method with the field name, doing type assertion.
@@ -439,7 +442,7 @@ type OpenLibraryWork struct {
Version int64 `json:"_version_"`
}
-// OpenLibraryWorkToRelease convert OL data into a release. XXX: release/work?
+// OpenLibraryWorkToRelease converts OL data into a release.
func OpenLibraryWorkToRelease(w *OpenLibraryWork) (*Release, error) {
var (
release Release
@@ -476,6 +479,7 @@ func OpenLibraryWorkToRelease(w *OpenLibraryWork) (*Release, error) {
return &release, nil
}
+// OpenLibraryEdition document, see: https://openlibrary.org/developers/dumps.
type OpenLibraryEdition struct {
Authors []struct {
Key string `json:"key"`
@@ -509,22 +513,12 @@ type OpenLibraryEdition struct {
// null
// {
// "type": "/type/text",
- // "value": "Includes index."
- // }
- // null
- // {
- // "type": "/type/text",
// "value": "Includes bibliographical references (p. 137-143)."
// }
// null
// "Includes bibliographical references (p. 203-205) and index."
// null
- // null
Notes interface{} `json:"notes"`
- // Notes struct {
- // Type string `json:"type"`
- // Value string `json:"value"`
- // } `json:"notes"`
NumberOfPages int64 `json:"number_of_pages"`
Ocaid string `json:"ocaid"`
@@ -547,7 +541,7 @@ type OpenLibraryEdition struct {
} `json:"works"`
}
-// Isbns returns all found ISBN. Unique, non-normalized.
+// Isbns returns all ISBNs found: unique, sorted, non-normalized.
func (v OpenLibraryEdition) Isbns() []string {
s := set.New()
for _, w := range v.Isbn10 {
@@ -559,7 +553,9 @@ func (v OpenLibraryEdition) Isbns() []string {
return s.Sorted()
}
-// OpenLibraryEditionToRelease convert OL data into a release. XXX: release/work?
+// OpenLibraryEditionToRelease converts an OL edition record into a release.
+// Takes a (potentially empty) author map to replace author keys (like
+// "/author/OL18273A") with names.
func OpenLibraryEditionToRelease(v *OpenLibraryEdition, authorMap map[string]string) (*Release, error) {
var (
release Release