From 59afe1e58dcaed9c328d72fcdffb0f669c34d6b6 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Wed, 28 Apr 2021 00:00:18 +0200
Subject: parse out isbn from refs

---
 skate/schema.go | 71 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 37 insertions(+), 34 deletions(-)

(limited to 'skate')

diff --git a/skate/schema.go b/skate/schema.go
index 6c96bb8..6f872eb 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -47,45 +47,48 @@ func RefToRelease(ref *Ref) (*Release, error) {
 		contribs[i].RawName = name
 	}
 	release.Contribs = contribs
-	// XXX: Find ISBN in unstructured. Might be expensive, do we need a flag?
-	unlo := strings.ToLower(ref.Biblio.Unstructured)
-	if strings.Contains(unlo, "isbn") {
-		// ISBN: 10: 0137822693, pp: 373
-		// Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec,
-		// ISBN9652910333. Shannon, C. (1948). A Mathematical Theory of
-		// Communication. The Bell System Technical Journal. July; October,
-		// Vol. 27, pp. 379-423; 623-656.
-		// Artech House, ISBN: 978-1-60807-201-9, 2011.
-		// ...
-		var (
-			candidates10 = isbn10Regex.FindAllString(ref.Biblio.Unstructured, -1)
-			candidates13 = isbn13Regex.FindAllString(ref.Biblio.Unstructured, -1)
-			valid        = set.New()
-		)
-		for _, v := range append(candidates10, candidates13...) {
-			var u []rune
-			for _, c := range v {
-				if c >= '0' && c <= '9' || c == 'x' || c == 'X' {
-					u = append(u, c)
-				}
+	if strings.Contains(strings.ToLower(ref.Biblio.Unstructured), "isbn") {
+		release.ExtIDs.ISBN = parseIsbn(ref.Biblio.Unstructured)
+	}
+	return &release, nil
+}
+
+// parseIsbn tries to find and validate ISBN from unstructured data.
+func parseIsbn(s string) []string {
+	// ISBN: 10: 0137822693, pp: 373
+	// Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec,
+	// ISBN9652910333. Shannon, C. (1948). A Mathematical Theory of
+	// Communication. The Bell System Technical Journal. July; October,
+	// Vol. 27, pp. 379-423; 623-656.
+	// Artech House, ISBN: 978-1-60807-201-9, 2011.
+	// ...
+	var (
+		candidates10 = isbn10Regex.FindAllString(s, -1)
+		candidates13 = isbn13Regex.FindAllString(s, -1)
+		valid        = set.New()
+	)
+	for _, v := range append(candidates10, candidates13...) {
+		var u []rune
+		for _, c := range v {
+			if c >= '0' && c <= '9' || c == 'x' || c == 'X' {
+				u = append(u, c)
 			}
-			s := string(u)
-			if !isbn.Validate(s) {
+		}
+		s := string(u)
+		if !isbn.Validate(s) {
+			continue
+		}
+		if len(s) < 12 {
+			w, err := isbn.To13(s)
+			if err != nil {
 				continue
 			}
-			if len(s) < 12 {
-				w, err := isbn.To13(s)
-				if err != nil {
-					continue
-				}
-				valid.Add(w)
-			} else {
-				valid.Add(s)
-			}
+			valid.Add(w)
+		} else {
+			valid.Add(s)
 		}
-		release.ExtIDs.ISBN = valid.Slice()
 	}
-	return &release, nil
+	return valid.Slice()
 }
 
 // Ref is a reference document, can be very partial.
-- 
cgit v1.2.3