aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-04-27 23:38:52 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-04-27 23:38:52 +0200
commit0396db2bc11eaee8f2058e82cc1b5ea0af9f0956 (patch)
treeefa38ba29fa31c7ac9debd96311d7b486d0f2973 /skate
parent0cf00f57575fb71e79d9a4b1bd7b3d59a682c63a (diff)
downloadrefcat-0396db2bc11eaee8f2058e82cc1b5ea0af9f0956.tar.gz
refcat-0396db2bc11eaee8f2058e82cc1b5ea0af9f0956.zip
ref: parse out isbn
Diffstat (limited to 'skate')
-rw-r--r--skate/fixtures/ref_with_isbn.json13
-rw-r--r--skate/schema.go44
-rw-r--r--skate/schema_test.go1
3 files changed, 58 insertions, 0 deletions
diff --git a/skate/fixtures/ref_with_isbn.json b/skate/fixtures/ref_with_isbn.json
new file mode 100644
index 0000000..2cd8480
--- /dev/null
+++ b/skate/fixtures/ref_with_isbn.json
@@ -0,0 +1,13 @@
+{
+ "biblio": {
+ "title": "Antibiotic Resistant Bacteria -A Continuous Challenge in the New Millennium Edited by Dr. Marina Pana ISBN",
+ "unstructured": "www.intechopen.com Antibiotic Resistant Bacteria -A Continuous Challenge in the New Millennium Edited by Dr. Marina Pana ISBN 978-953-51-0472-8"
+ },
+ "index": 443,
+ "key": "b443",
+ "ref_source": "grobid",
+ "release_ident": "n4zvrgchmfexdb6gesxfgxykxi",
+ "release_year": 2012,
+ "work_ident": "aaan6iujevgpnmcif2hb62uaai"
+}
+
diff --git a/skate/schema.go b/skate/schema.go
index a9b1e8a..6c96bb8 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -2,6 +2,7 @@ package skate
import (
"fmt"
+ "regexp"
"strconv"
"strings"
@@ -9,6 +10,11 @@ import (
"git.archive.org/martin/cgraph/skate/set"
)
+var (
+ isbn10Regex = regexp.MustCompile(`[0-9xX -]{10,18}`)
+ isbn13Regex = regexp.MustCompile(`9[0-9xX -]{12,20}`)
+)
+
// RefToRelease converts a ref to a release. Set an extra.skate.status flag to
// be able to distinguish converted entities later.
func RefToRelease(ref *Ref) (*Release, error) {
@@ -41,6 +47,44 @@ func RefToRelease(ref *Ref) (*Release, error) {
contribs[i].RawName = name
}
release.Contribs = contribs
+ // XXX: Find ISBN in unstructured. Might be expensive, do we need a flag?
+ unlo := strings.ToLower(ref.Biblio.Unstructured)
+ if strings.Contains(unlo, "isbn") {
+ // ISBN: 10: 0137822693, pp: 373
+ // Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec,
+ // ISBN9652910333. Shannon, C. (1948). A Mathematical Theory of
+ // Communication. The Bell System Technical Journal. July; October,
+ // Vol. 27, pp. 379-423; 623-656.
+ // Artech House, ISBN: 978-1-60807-201-9, 2011.
+ // ...
+ var (
+ candidates10 = isbn10Regex.FindAllString(ref.Biblio.Unstructured, -1)
+ candidates13 = isbn13Regex.FindAllString(ref.Biblio.Unstructured, -1)
+ valid = set.New()
+ )
+ for _, v := range append(candidates10, candidates13...) {
+ var u []rune
+ for _, c := range v {
+ if c >= '0' && c <= '9' || c == 'x' || c == 'X' {
+ u = append(u, c)
+ }
+ }
+ s := string(u)
+ if !isbn.Validate(s) {
+ continue
+ }
+ if len(s) < 12 {
+ w, err := isbn.To13(s)
+ if err != nil {
+ continue
+ }
+ valid.Add(w)
+ } else {
+ valid.Add(s)
+ }
+ }
+ release.ExtIDs.ISBN = valid.Slice()
+ }
return &release, nil
}
diff --git a/skate/schema_test.go b/skate/schema_test.go
index 6a95115..c1cec35 100644
--- a/skate/schema_test.go
+++ b/skate/schema_test.go
@@ -9,6 +9,7 @@ import (
"github.com/nsf/jsondiff"
)
+// XXX: Work on JSON directly, as structs can get unwieldy.
func TestOpenLibraryToRelease(t *testing.T) {
var cases = []struct {
work OpenLibraryWork