From 0396db2bc11eaee8f2058e82cc1b5ea0af9f0956 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 27 Apr 2021 23:38:52 +0200 Subject: ref: parse out isbn --- skate/fixtures/ref_with_isbn.json | 13 ++++++++++++ skate/schema.go | 44 +++++++++++++++++++++++++++++++++++++++ skate/schema_test.go | 1 + 3 files changed, 58 insertions(+) create mode 100644 skate/fixtures/ref_with_isbn.json (limited to 'skate') diff --git a/skate/fixtures/ref_with_isbn.json b/skate/fixtures/ref_with_isbn.json new file mode 100644 index 0000000..2cd8480 --- /dev/null +++ b/skate/fixtures/ref_with_isbn.json @@ -0,0 +1,13 @@ +{ + "biblio": { + "title": "Antibiotic Resistant Bacteria -A Continuous Challenge in the New Millennium Edited by Dr. Marina Pana ISBN", + "unstructured": "www.intechopen.com Antibiotic Resistant Bacteria -A Continuous Challenge in the New Millennium Edited by Dr. Marina Pana ISBN 978-953-51-0472-8" + }, + "index": 443, + "key": "b443", + "ref_source": "grobid", + "release_ident": "n4zvrgchmfexdb6gesxfgxykxi", + "release_year": 2012, + "work_ident": "aaan6iujevgpnmcif2hb62uaai" +} + diff --git a/skate/schema.go b/skate/schema.go index a9b1e8a..6c96bb8 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -2,6 +2,7 @@ package skate import ( "fmt" + "regexp" "strconv" "strings" @@ -9,6 +10,11 @@ import ( "git.archive.org/martin/cgraph/skate/set" ) +var ( + isbn10Regex = regexp.MustCompile(`[0-9xX -]{10,18}`) + isbn13Regex = regexp.MustCompile(`9[0-9xX -]{12,20}`) +) + // RefToRelease converts a ref to a release. Set a extra.skate.status flag to // be able to distinguish converted entities later. func RefToRelease(ref *Ref) (*Release, error) { @@ -41,6 +47,44 @@ func RefToRelease(ref *Ref) (*Release, error) { contribs[i].RawName = name } release.Contribs = contribs + // XXX: Find ISBN in unstructured. Might be expensive, do we need a flag? + unlo := strings.ToLower(ref.Biblio.Unstructured) + if strings.Contains(unlo, "isbn") { + // ISBN: 10: 0137822693, pp: 373 + // Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec, + // ISBN9652910333. Shannon, C. (1948). A Mathematical Theory of + // Communication. The Bell System Technical Journal. July; October, + // Vol. 27, pp. 379-423; 623-656. + // Artech House, ISBN: 978-1-60807-201-9, 2011. + // ... + var ( + candidates10 = isbn10Regex.FindAllString(ref.Biblio.Unstructured, -1) + candidates13 = isbn13Regex.FindAllString(ref.Biblio.Unstructured, -1) + valid = set.New() + ) + for _, v := range append(candidates10, candidates13...) { + var u []rune + for _, c := range v { + if c >= '0' && c <= '9' || c == 'x' || c == 'X' { + u = append(u, c) + } + } + s := string(u) + if !isbn.Validate(s) { + continue + } + if len(s) < 12 { + w, err := isbn.To13(s) + if err != nil { + continue + } + valid.Add(w) + } else { + valid.Add(s) + } + } + release.ExtIDs.ISBN = valid.Slice() + } return &release, nil } diff --git a/skate/schema_test.go b/skate/schema_test.go index 6a95115..c1cec35 100644 --- a/skate/schema_test.go +++ b/skate/schema_test.go @@ -9,6 +9,7 @@ import ( "github.com/nsf/jsondiff" ) +// XXX: Work on JSON directly, as structs can get unwieldy. func TestOpenLibraryToRelease(t *testing.T) { var cases = []struct { work OpenLibraryWork -- cgit v1.2.3