From 63517e0ec102ab2c534193bd024dd27016739877 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 27 Jul 2021 01:19:19 +0200 Subject: schema: tweaks add String() to CSLDate; we only cover a few typical cases --- skate/schema.go | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--- skate/schema_test.go | 31 +++++++++++++++++++ 2 files changed, 112 insertions(+), 5 deletions(-) diff --git a/skate/schema.go b/skate/schema.go index 7717f9f..18d0e83 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -21,6 +21,10 @@ var ( isbn10Regex = regexp.MustCompile(`[O0-9xX -]{10,18}`) isbn13Regex = regexp.MustCompile(`9[O0-9xX -]{12,20}`) + // Related to CSL processing. + yearMonthPat = regexp.MustCompile(`[12][0-9]{3,3}-(0?[1-9]|1[12])`) + yearMonthDayPat = regexp.MustCompile(`[12][0-9]{3,3}-(0?[1-9]|1[12])-([0]?[1-9]|[12][0-9]|[3][01])`) + // openLibraryDateLayouts, e.g. as found in Open Library Editions, // .publish_date. openLibraryDateLayouts = []string{ @@ -73,14 +77,16 @@ type Biblio struct { Unstructured string `json:"unstructured,omitempty"` Url string `json:"url,omitempty"` Volume string `json:"volume,omitempty"` - Year int64 `json:"year,omitempty"` - // More non-standard fields go into extra. + Year int `json:"year,omitempty"` + // Any field we may require as part of our processing should go into an + // extra section. Extra struct { ISBN []string `json:"isbn"` } `json:"extra"` } -// RefToRelease converts a ref to a release. +// RefToRelease converts a ref to a release. We want this e.g. for a release to +// release fuzzy verification, when one of the docs is a ref. func RefToRelease(ref *Ref) (*Release, error) { var ( release Release @@ -103,6 +109,7 @@ func RefToRelease(ref *Ref) (*Release, error) { release.Volume = b.Volume release.Issue = b.Issue release.Pages = b.Pages + // Skip some accidental zero or bogus years. 
if ref.ReleaseYear > 1000 { release.ReleaseYearValue = fmt.Sprintf("%d", ref.ReleaseYear) } @@ -124,7 +131,7 @@ func RefToRelease(ref *Ref) (*Release, error) { // ReleaseToUnstructured tries to render a sensible string, e.g. for frontend // display of unmatched and other relations. Some examples: -// https://guides.lib.uw.edu/c.php?g=341448&p=4076094 No specific style, just +// https://guides.lib.uw.edu/c.php?g=341448&p=4076094 - no specific style, just // try to be readable. func ReleaseToUnstructured(r *Release) string { var ( @@ -199,9 +206,10 @@ func ReleaseToUnstructured(r *Release) string { return sb.String() } -// ParseIsbn tries to find and validate ISBN from unstructured data. Returns a +// ParseIsbn tries to find and validate ISBN from a string. Returns a // list of unique, unsorted and validated ISBN13, e.g. 9780123838520. func ParseIsbn(s string) []string { + // Some example inputs: // ISBN: 10: 0137822693, pp: 373 // Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec, // ISBN9652910333. Shannon, C. (1948). A Mathematical Theory of @@ -317,6 +325,8 @@ type Release struct { MetadataVersion int `json:"metadataVersion,omitempty"` Relations []DataCiteRelation `json:"relations,omitempty"` } `json:"datacite,omitempty"` + // Anything in the "Skate" substruct should be considered private to + // skate, nothing to depend upon outside this pipeline. Skate struct { // Mark as converted from "ref", "rg" or other schemas. Status string `json:"status,omitempty"` @@ -330,6 +340,9 @@ type Release struct { ResearchGate struct { URL string `json:"url,omitempty"` } `json:"rg,omitempty"` + // At a point, where we have a release that was a ref and we only + // have a partial container name, we can include any discovered + // container name here. ResolvedContainerName string `json:"resolved_container_name"` } `json:"skate,omitempty"` OpenLibrary struct { @@ -471,11 +484,74 @@ type CSL struct { // as a second array. The second date format is a raw string. 
// The recommended encoding is a string that represents the date in a numeric
// year-month-day format.
// https://docs.citationstyles.org/en/stable/specification.html#date-part, in a
// json schema: https://git.io/J411z
//
// Raw might be removed in the future: https://discourse.citationstyles.org/t/raw-dates-vs-date-parts/1533/12
type CSLDate struct {
	// Raw is a free-form date string; when set, it takes precedence in String.
	Raw string `json:"raw,omitempty"`
	// Parts holds the "date-parts" arrays; String only renders the case of
	// exactly one inner array with one to three elements.
	Parts [][]int `json:"date-parts,omitempty"`
}

// String renders the date as a string. We only cover a few typical cases:
//
//   - a non-empty Raw value is returned unchanged,
//   - a single date-parts entry [year] is rendered as "YYYY" (no validation),
//   - [year, month] is rendered as "YYYY-MM",
//   - [year, month, day] is rendered as "YYYY-MM-DD".
//
// For the two and three element forms, the year must be within 1000-2999, the
// month within 1-12 and the day within 1-31; otherwise the empty string is
// returned. Anything else (no parts, multiple date-parts entries such as
// ranges, more than three elements) yields the empty string as well.
//
// The "date-parts" field has quite some spec around it, also some open issues
// (e.g. the schema allows for strings and numbers, which might lead to issues
// like: https://github.com/zotero/zotero/issues/1603). Here, parts are plain
// ints.
func (c *CSLDate) String() string {
	if c.Raw != "" {
		return c.Raw
	}
	if len(c.Parts) != 1 {
		return ""
	}
	parts := c.Parts[0]
	if len(parts) == 0 || len(parts) > 3 {
		return ""
	}
	year := parts[0]
	if len(parts) == 1 {
		return fmt.Sprintf("%d", year)
	}
	// Range checks are done on the numbers themselves rather than by matching
	// the rendered string against a pattern, so every month (including
	// October) is accepted and out-of-range values like month 13 or day 32
	// are rejected.
	if year < 1000 || year > 2999 {
		return ""
	}
	// Assume year, month for the moment.
	month := parts[1]
	if month < 1 || month > 12 {
		return ""
	}
	if len(parts) == 2 {
		return fmt.Sprintf("%d-%02d", year, month)
	}
	// Assume year, month, day for the moment.
	day := parts[2]
	if day < 1 || day > 31 {
		return ""
	}
	return fmt.Sprintf("%d-%02d-%02d", year, month, day)
}

// Some personal names are represented by a single field (e.g. mononyms such as
// “Prince” or “Plato”). In such cases, the name can be delivered as a lone
// family element.
Institutional names may be delivered in the same way, but it diff --git a/skate/schema_test.go b/skate/schema_test.go index 59b1f58..3e783df 100644 --- a/skate/schema_test.go +++ b/skate/schema_test.go @@ -364,6 +364,37 @@ func TestReleaseToUnstructured(t *testing.T) { } } +func TestCSLDateString(t *testing.T) { + var cases = []struct { + date CSLDate + s string + }{ + {CSLDate{}, ""}, + {CSLDate{Raw: "2012"}, "2012"}, + {CSLDate{Raw: "about 2012"}, "about 2012"}, + {CSLDate{Raw: "", Parts: nil}, ""}, + {CSLDate{Raw: "", Parts: [][]int{}}, ""}, + {CSLDate{Raw: "", Parts: [][]int{ + []int{2001}, + }}, "2001"}, + {CSLDate{Raw: "", Parts: [][]int{ + []int{2001, 1}, + }}, "2001-01"}, + {CSLDate{Raw: "", Parts: [][]int{ + []int{2001, 1, 1}, + }}, "2001-01-01"}, + {CSLDate{Raw: "", Parts: [][]int{ + []int{2001, 1, 12}, + }}, "2001-01-12"}, + } + for i, c := range cases { + result := c.date.String() + if result != c.s { + t.Fatalf("[%d] got %v, want %v", i, result, c.s) + } + } +} + func BenchmarkParseIsbn(b *testing.B) { for n := 0; n < b.N; n++ { ParseIsbn("House Pvt. Limited., (2006), ISBN 9788183561426. Date accessed: August 2015.") -- cgit v1.2.3