aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-07-27 01:19:19 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-07-27 01:19:19 +0200
commit63517e0ec102ab2c534193bd024dd27016739877 (patch)
treec9f204baa10949ead0e269d842000c738d63fd89
parent715aa435b7d12265cbc041fff126bc1f9f653984 (diff)
downloadrefcat-63517e0ec102ab2c534193bd024dd27016739877.tar.gz
refcat-63517e0ec102ab2c534193bd024dd27016739877.zip
schema: tweaks
add String() to CSLDate; we only cover a few typical cases
-rw-r--r--skate/schema.go86
-rw-r--r--skate/schema_test.go31
2 files changed, 112 insertions, 5 deletions
diff --git a/skate/schema.go b/skate/schema.go
index 7717f9f..18d0e83 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -21,6 +21,10 @@ var (
isbn10Regex = regexp.MustCompile(`[O0-9xX -]{10,18}`)
isbn13Regex = regexp.MustCompile(`9[O0-9xX -]{12,20}`)
+	// Related to CSL processing. Both patterns are anchored, so they only
+	// match when the whole string is a date. Months 1-12 and days 1-31 are
+	// accepted, each with an optional leading zero; the day is not checked
+	// against the length of the month.
+	yearMonthPat    = regexp.MustCompile(`^[12][0-9]{3}-(0?[1-9]|1[0-2])$`)
+	yearMonthDayPat = regexp.MustCompile(`^[12][0-9]{3}-(0?[1-9]|1[0-2])-(0?[1-9]|[12][0-9]|3[01])$`)
+
// openLibraryDateLayouts, e.g. as found in Open Library Editions,
// .publish_date.
openLibraryDateLayouts = []string{
@@ -73,14 +77,16 @@ type Biblio struct {
Unstructured string `json:"unstructured,omitempty"`
Url string `json:"url,omitempty"`
Volume string `json:"volume,omitempty"`
- Year int64 `json:"year,omitempty"`
- // More non-standard fields go into extra.
+ Year int `json:"year,omitempty"`
+ // Any field we may require as part of our processing should go into an
+ // extra section.
Extra struct {
ISBN []string `json:"isbn"`
} `json:"extra"`
}
-// RefToRelease converts a ref to a release.
+// RefToRelease converts a ref to a release. We want this e.g. for a release to
+// release fuzzy verification, when one of the docs is a ref.
func RefToRelease(ref *Ref) (*Release, error) {
var (
release Release
@@ -103,6 +109,7 @@ func RefToRelease(ref *Ref) (*Release, error) {
release.Volume = b.Volume
release.Issue = b.Issue
release.Pages = b.Pages
+ // Skip some accidental zero or bogus years.
if ref.ReleaseYear > 1000 {
release.ReleaseYearValue = fmt.Sprintf("%d", ref.ReleaseYear)
}
@@ -124,7 +131,7 @@ func RefToRelease(ref *Ref) (*Release, error) {
// ReleaseToUnstructured tries to render a sensible string, e.g. for frontend
// display of unmatched and other relations. Some examples:
-// https://guides.lib.uw.edu/c.php?g=341448&p=4076094 No specific style, just
+// https://guides.lib.uw.edu/c.php?g=341448&p=4076094 - no specific style, just
// try to be readable.
func ReleaseToUnstructured(r *Release) string {
var (
@@ -199,9 +206,10 @@ func ReleaseToUnstructured(r *Release) string {
return sb.String()
}
-// ParseIsbn tries to find and validate ISBN from unstructured data. Returns a
+// ParseIsbn tries to find and validate ISBN from a string. Returns a
// list of unique, unsorted and validated ISBN13, e.g. 9780123838520.
func ParseIsbn(s string) []string {
+ // Some example inputs:
// ISBN: 10: 0137822693, pp: 373
// Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec,
// ISBN9652910333. Shannon, C. (1948). A Mathematical Theory of
@@ -317,6 +325,8 @@ type Release struct {
MetadataVersion int `json:"metadataVersion,omitempty"`
Relations []DataCiteRelation `json:"relations,omitempty"`
} `json:"datacite,omitempty"`
+ // Anything in the "Skate" substruct should be considered private to
+ // skate, nothing to depend upon outside this pipeline.
Skate struct {
// Mark as converted from "ref", "rg" or other schemas.
Status string `json:"status,omitempty"`
@@ -330,6 +340,9 @@ type Release struct {
ResearchGate struct {
URL string `json:"url,omitempty"`
} `json:"rg,omitempty"`
+ // At a point, where we have a release that was a ref and we only
+ // have a partial container name, we can include any discovered
+ // container name here.
ResolvedContainerName string `json:"resolved_container_name"`
} `json:"skate,omitempty"`
OpenLibrary struct {
@@ -471,11 +484,74 @@ type CSL struct {
// as a second array. The second date format is a raw string. The recommended
// encoding is a string that represents the date in a numeric year-month-day
// format.
+// https://docs.citationstyles.org/en/stable/specification.html#date-part, in a
+// json schema: https://git.io/J411z
+//
+// Raw might be removed in the future: https://discourse.citationstyles.org/t/raw-dates-vs-date-parts/1533/12
type CSLDate struct {
// Raw is the free-form date string (JSON key "raw"). When it is non-empty,
// String returns it unchanged and Parts is not consulted.
Raw string `json:"raw,omitempty"`
// Parts is the structured form (JSON key "date-parts"): a list of dates,
// each given as a list of integers. String only renders the case of
// exactly one date with one to three components, which it assumes to be
// year, month and day, in that order.
Parts [][]int `json:"date-parts,omitempty"`
}
+// String renders the date as a string. A non-empty Raw value is returned
+// verbatim. Otherwise only a single date (exactly one entry in Parts) is
+// rendered, as "YYYY", "YYYY-MM" or "YYYY-MM-DD", depending on the number of
+// components. Date ranges, entries with no or more than three components, and
+// year-month(-day) entries with an implausible year, month or day yield the
+// empty string. A nil receiver yields the empty string as well.
+func (c *CSLDate) String() string {
+	if c == nil {
+		return ""
+	}
+	if c.Raw != "" {
+		return c.Raw
+	}
+	// The "date-parts" field has quite some spec around it, also some open
+	// issues (e.g. the schema allows for strings and numbers, which might
+	// lead to issues like: https://github.com/zotero/zotero/issues/1603).
+	// Parts is typed as [][]int, so only integer components arrive here.
+	if len(c.Parts) != 1 {
+		return ""
+	}
+	// Range checks are done on the integers themselves, so every month from
+	// 1 to 12 is accepted and out-of-range values cannot slip through on a
+	// partial match. The day is not checked against the length of the month.
+	var (
+		parts      = c.Parts[0]
+		validYear  = func(y int) bool { return y >= 1000 && y <= 2999 }
+		validMonth = func(m int) bool { return m >= 1 && m <= 12 }
+		validDay   = func(d int) bool { return d >= 1 && d <= 31 }
+	)
+	switch len(parts) {
+	case 1:
+		// A lone year is rendered as is, without a plausibility check.
+		return fmt.Sprintf("%d", parts[0])
+	case 2:
+		// Assume year, month for the moment.
+		year, month := parts[0], parts[1]
+		if validYear(year) && validMonth(month) {
+			return fmt.Sprintf("%d-%02d", year, month)
+		}
+	case 3:
+		// Assume year, month, day for the moment.
+		year, month, day := parts[0], parts[1], parts[2]
+		if validYear(year) && validMonth(month) && validDay(day) {
+			return fmt.Sprintf("%d-%02d-%02d", year, month, day)
+		}
+	}
+	return ""
+}
+
// Some personal names are represented by a single field (e.g. mononyms such as
// “Prince” or “Plato”). In such cases, the name can be delivered as a lone
// family element. Institutional names may be delivered in the same way, but it
diff --git a/skate/schema_test.go b/skate/schema_test.go
index 59b1f58..3e783df 100644
--- a/skate/schema_test.go
+++ b/skate/schema_test.go
@@ -364,6 +364,37 @@ func TestReleaseToUnstructured(t *testing.T) {
}
}
+// TestCSLDateString checks the string rendering of CSLDate: Raw takes
+// precedence over Parts, a single date with one to three components is
+// rendered as year, year-month or year-month-day, and anything else (no
+// parts, an empty date, a range, too many components) renders as the empty
+// string.
+func TestCSLDateString(t *testing.T) {
+	var cases = []struct {
+		date CSLDate
+		s    string
+	}{
+		{CSLDate{}, ""},
+		{CSLDate{Raw: "2012"}, "2012"},
+		{CSLDate{Raw: "about 2012"}, "about 2012"},
+		// Raw wins, even if structured parts are present.
+		{CSLDate{Raw: "2012", Parts: [][]int{{2001}}}, "2012"},
+		{CSLDate{Parts: nil}, ""},
+		{CSLDate{Parts: [][]int{}}, ""},
+		{CSLDate{Parts: [][]int{{}}}, ""},
+		{CSLDate{Parts: [][]int{{2001}}}, "2001"},
+		{CSLDate{Parts: [][]int{{2001, 1}}}, "2001-01"},
+		{CSLDate{Parts: [][]int{{2001, 12}}}, "2001-12"},
+		{CSLDate{Parts: [][]int{{2001, 1, 1}}}, "2001-01-01"},
+		{CSLDate{Parts: [][]int{{2001, 1, 12}}}, "2001-01-12"},
+		{CSLDate{Parts: [][]int{{2001, 12, 31}}}, "2001-12-31"},
+		// Ranges and over-long dates are not rendered.
+		{CSLDate{Parts: [][]int{{2001, 1, 1}, {2002, 1, 1}}}, ""},
+		{CSLDate{Parts: [][]int{{2001, 1, 1, 1}}}, ""},
+	}
+	for i, c := range cases {
+		// Errorf rather than Fatalf, so one run reports every failing case.
+		if got := c.date.String(); got != c.s {
+			t.Errorf("[%d] got %q, want %q", i, got, c.s)
+		}
+	}
+}
+
func BenchmarkParseIsbn(b *testing.B) {
for n := 0; n < b.N; n++ {
ParseIsbn("House Pvt. Limited., (2006), ISBN 9788183561426. Date accessed: August 2015.")