diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-04-27 23:38:52 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-04-27 23:38:52 +0200 |
commit | 0396db2bc11eaee8f2058e82cc1b5ea0af9f0956 (patch) | |
tree | efa38ba29fa31c7ac9debd96311d7b486d0f2973 /skate | |
parent | 0cf00f57575fb71e79d9a4b1bd7b3d59a682c63a (diff) | |
download | refcat-0396db2bc11eaee8f2058e82cc1b5ea0af9f0956.tar.gz refcat-0396db2bc11eaee8f2058e82cc1b5ea0af9f0956.zip |
ref: parse out isbn
Diffstat (limited to 'skate')
-rw-r--r-- | skate/fixtures/ref_with_isbn.json | 13 | ||||
-rw-r--r-- | skate/schema.go | 44 | ||||
-rw-r--r-- | skate/schema_test.go | 1 |
3 files changed, 58 insertions, 0 deletions
diff --git a/skate/fixtures/ref_with_isbn.json b/skate/fixtures/ref_with_isbn.json new file mode 100644 index 0000000..2cd8480 --- /dev/null +++ b/skate/fixtures/ref_with_isbn.json @@ -0,0 +1,13 @@ +{ + "biblio": { + "title": "Antibiotic Resistant Bacteria -A Continuous Challenge in the New Millennium Edited by Dr. Marina Pana ISBN", + "unstructured": "www.intechopen.com Antibiotic Resistant Bacteria -A Continuous Challenge in the New Millennium Edited by Dr. Marina Pana ISBN 978-953-51-0472-8" + }, + "index": 443, + "key": "b443", + "ref_source": "grobid", + "release_ident": "n4zvrgchmfexdb6gesxfgxykxi", + "release_year": 2012, + "work_ident": "aaan6iujevgpnmcif2hb62uaai" +} + diff --git a/skate/schema.go b/skate/schema.go index a9b1e8a..6c96bb8 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -2,6 +2,7 @@ package skate import ( "fmt" + "regexp" "strconv" "strings" @@ -9,6 +10,11 @@ import ( "git.archive.org/martin/cgraph/skate/set" ) +var ( + isbn10Regex = regexp.MustCompile(`[0-9xX -]{10,18}`) + isbn13Regex = regexp.MustCompile(`9[0-9xX -]{12,20}`) +) + // RefToRelease converts a ref to a release. Set a extra.skate.status flag to // be able to distinguish converted entities later. func RefToRelease(ref *Ref) (*Release, error) { @@ -41,6 +47,44 @@ func RefToRelease(ref *Ref) (*Release, error) { contribs[i].RawName = name } release.Contribs = contribs + // XXX: Find ISBN in unstructured. Might be expensive, do we need a flag? + unlo := strings.ToLower(ref.Biblio.Unstructured) + if strings.Contains(unlo, "isbn") { + // ISBN: 10: 0137822693, pp: 373 + // Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec, + // ISBN9652910333. Shannon, C. (1948). A Mathematical Theory of + // Communication. The Bell System Technical Journal. July; October, + // Vol. 27, pp. 379-423; 623-656. + // Artech House, ISBN: 978-1-60807-201-9, 2011. + // ... + var ( + candidates10 = isbn10Regex.FindAllString(ref.Biblio.Unstructured, -1) + candidates13 = isbn13Regex.FindAllString(ref.Biblio.Unstructured, -1) + valid = set.New() + ) + for _, v := range append(candidates10, candidates13...) { + var u []rune + for _, c := range v { + if c >= '0' && c <= '9' || c == 'x' || c == 'X' { + u = append(u, c) + } + } + s := string(u) + if !isbn.Validate(s) { + continue + } + if len(s) < 12 { + w, err := isbn.To13(s) + if err != nil { + continue + } + valid.Add(w) + } else { + valid.Add(s) + } + } + release.ExtIDs.ISBN = valid.Slice() + } return &release, nil } diff --git a/skate/schema_test.go b/skate/schema_test.go index 6a95115..c1cec35 100644 --- a/skate/schema_test.go +++ b/skate/schema_test.go @@ -9,6 +9,7 @@ import ( "github.com/nsf/jsondiff" ) +// XXX: Work on JSON directly, as structs can get unwieldy. func TestOpenLibraryToRelease(t *testing.T) { var cases = []struct { work OpenLibraryWork |