author	Martin Czygan <martin.czygan@gmail.com>	2021-07-01 12:55:53 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-07-01 12:55:53 +0200
commit	3c9df2ab9dfbbfcf35e5b7e9415a1123df641bf3 (patch)
tree	555b53a2520de37f2e50579af7ceea3350f46fdc /skate
parent	562713dd62970b2bfd155fc68ad3f5a49699ae1f (diff)
tweak naming and docs
Diffstat (limited to 'skate')
 skate/zippy.go | 53 +++++++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 26 deletions(-)
diff --git a/skate/zippy.go b/skate/zippy.go
index e7677b9..b69ce2b 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -54,12 +54,12 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer)
if len(g.G0) == 0 || len(g.G1) == 0 {
return nil
}
- if target, err = stringToRelease(Cut(g.G0[0], 2)); err != nil {
+ if target, err = parseRelease(Cut(g.G0[0], 2)); err != nil {
groupLogf(g, "[skip] failed to parse release: %v", err)
return nil
}
for _, line := range g.G1 {
- if ref, err = stringToRef(Cut(line, 2)); err != nil {
+ if ref, err = parseRef(Cut(line, 2)); err != nil {
groupLogf(g, "[skip] failed to parse ref: %v", err)
continue
}
@@ -87,8 +87,8 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer)
}
// ZippyExactReleases takes two release readers (key, doc) and assigns a fixed
-// match result.
-func ZippyExactReleases(olReader, reReader io.Reader, matchResult MatchResult, w io.Writer) error {
+// match result, e.g. used with release entities converted from Open Library snapshots.
+func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.Writer) error {
var (
enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
@@ -105,12 +105,12 @@ func ZippyExactReleases(olReader, reReader io.Reader, matchResult MatchResult, w
if len(g.G0) == 0 || len(g.G1) == 0 {
return nil
}
- if target, err = stringToRelease(Cut(g.G0[0], 2)); err != nil {
+ if target, err = parseRelease(Cut(g.G0[0], 2)); err != nil {
groupLogf(g, "[skip] failed to parse release: %v", err)
return nil
}
for _, line := range g.G1 {
- if re, err = stringToRelease(Cut(line, 2)); err != nil {
+ if re, err = parseRelease(Cut(line, 2)); err != nil {
groupLogf(g, "[skip] failed to parse release: %v", err)
continue
}
@@ -135,7 +135,7 @@ func ZippyExactReleases(olReader, reReader io.Reader, matchResult MatchResult, w
return nil
}
)
- zipper := zipkey.New(olReader, reReader, keyer, grouper)
+ zipper := zipkey.New(olr, releases, keyer, grouper)
return zipper.Run()
}
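
All of the Zippy* functions in this file share the same zipkey merge-join shape: both inputs are key-sorted (key, doc) lines, makeKeyFunc extracts the key column, and the grouper callback receives one group of lines per key. A minimal sketch of that shape, assuming the skate package context and that zipkey.Group exposes G0 and G1 exactly as used above (the function name zippySketch is hypothetical):

func zippySketch(a, b io.Reader, w io.Writer) error {
	var (
		enc     = json.NewEncoder(w)
		keyer   = makeKeyFunc("\t", 1) // the key lives in column 1 of each line
		grouper = func(g *zipkey.Group) error {
			if len(g.G0) == 0 || len(g.G1) == 0 {
				return nil // nothing to join under this key
			}
			// g.G0 and g.G1 hold all lines from a and b that share the
			// current key; the JSON document sits in column 2, hence the
			// Cut(line, 2) calls above.
			return enc.Encode(map[string]int{"left": len(g.G0), "right": len(g.G1)})
		}
	)
	zipper := zipkey.New(a, b, keyer, grouper)
	return zipper.Run()
}
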
@@ -154,11 +154,11 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
if len(g.G0) == 0 || len(g.G1) == 0 {
return nil
}
- if target, err = stringToRelease(Cut(g.G0[0], 2)); err != nil {
+ if target, err = parseRelease(Cut(g.G0[0], 2)); err != nil {
return err
}
for _, line := range g.G1 {
- if wiki, err = stringToWiki(Cut(line, 2)); err != nil {
+ if wiki, err = parseWiki(Cut(line, 2)); err != nil {
return err
}
var bref BiblioRef
@@ -180,8 +180,9 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
return zipper.Run()
}
-// ZippyVerifyRefs takes a release and refs reader (key, doc), run fuzzy
-// verification and will emit a biblioref document, if exact or strong match.
+// ZippyVerifyRefs takes a release and refs (as release) reader (key, doc), runs
+// fuzzy verification and emits a biblioref document on an exact or strong
+// match.
func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error {
var (
enc = json.NewEncoder(w)
@@ -194,11 +195,11 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error {
if len(g.G0) == 0 || len(g.G1) == 0 {
return nil
}
- if pivot, err = stringToRelease(Cut(g.G0[0], 2)); err != nil {
+ if pivot, err = parseRelease(Cut(g.G0[0], 2)); err != nil {
return err
}
for _, line := range g.G1 {
- if re, err = stringToRelease(Cut(line, 2)); err != nil {
+ if re, err = parseRelease(Cut(line, 2)); err != nil {
return err
}
result := Verify(pivot, re)
@@ -212,9 +213,6 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error {
return err
}
default:
- // XXX: We want to add unmatched pieces as well; here? We
- // probably want to do a single final pass to complete the
- // dataset.
}
}
return nil
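
The fuzzy step above hinges on Verify: the pivot release is compared against each candidate and only sufficiently good results are kept. A sketch of that decision; only the Verify(pivot, re) call is taken from the code above, while the result field and status constant names (Status, StatusExact, StatusStrong) and the helper name keepMatch are assumptions:

// keepMatch reports whether a verification result is good enough to emit a
// biblioref for the (pivot, candidate) pair.
func keepMatch(pivot, re *Release) bool {
	result := Verify(pivot, re)
	switch result.Status {
	case StatusExact, StatusStrong:
		return true // emit a biblioref for this pair
	default:
		return false // weak, ambiguous or different: skip
	}
}
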
@@ -238,11 +236,11 @@ func ZippyVerifyRefsOpenLibraryTable(olr, refs io.Reader, w io.Writer) error {
return nil
}
// We take a single edition from OL.
- if pivot, err = stringToRelease(Cut(g.G0[0], 2)); err != nil {
+ if pivot, err = parseRelease(Cut(g.G0[0], 2)); err != nil {
return err
}
for _, line := range g.G1 {
- if re, err = stringToRelease(Cut(line, 2)); err != nil {
+ if re, err = parseRelease(Cut(line, 2)); err != nil {
return err
}
// The refs have a container name, but not a title, but here we
@@ -267,7 +265,7 @@ func ZippyVerifyRefsOpenLibraryTable(olr, refs io.Reader, w io.Writer) error {
}
// ZippyVerifyRefsOpenLibrary takes OL editions (as release) and refs (as
-// release) and emits a match table for manual inspection.
+// release) and writes biblioref documents.
func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
var (
enc = json.NewEncoder(w)
@@ -281,11 +279,11 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
return nil
}
// We take a single edition from OL.
- if pivot, err = stringToRelease(Cut(g.G0[0], 2)); err != nil {
+ if pivot, err = parseRelease(Cut(g.G0[0], 2)); err != nil {
return err
}
for _, line := range g.G1 {
- if ref, err = stringToRelease(Cut(line, 2)); err != nil {
+ if ref, err = parseRelease(Cut(line, 2)); err != nil {
return err
}
// The refs have a container name, but not a title, but here we
@@ -320,11 +318,14 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
// ZippyBrefAugment takes all matched docs from bref and adds docs from raw
// refs, which have not been matched. It also gets rid of duplicate matches.
+// Note: This operates on two streams: raw refs with about 2.5B records
+// (07/2021) and matches, which will be about 1B; in essence we have to
+// iterate through about 3.5B records; small tweaks here may be worthwhile.
//
// We can identify which docs have been matched by checking the source ident,
// ref index and key.
//
-// TODO: This needs to be completed.
+// TODO: This needs to be completed and made fast.
func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error {
var (
stats = statsAugment{}
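
The check described in the comment above, i.e. telling matched from unmatched raw refs, can be thought of as a composite key lookup over source ident, ref index and key. A sketch; the BiblioRef field names (SourceReleaseIdent, RefIndex, Key), the helper name matchedKey and the use of fmt are assumptions:

// matchedKey builds the identity of a matched ref from source ident, ref
// index and key, so raw refs already covered by a match can be skipped.
func matchedKey(b *BiblioRef) string {
	return fmt.Sprintf("%s-%d-%s", b.SourceReleaseIdent, b.RefIndex, b.Key)
}
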
@@ -557,22 +558,22 @@ func makeKeyFunc(sep string, column int) func(string) (string, error) {
}
}
-func stringToRelease(s string) (r *Release, err error) {
+func parseRelease(s string) (r *Release, err error) {
err = json.Unmarshal([]byte(s), &r)
return
}
-func stringToRef(s string) (r *Ref, err error) {
+func parseRef(s string) (r *Ref, err error) {
err = json.Unmarshal([]byte(s), &r)
return
}
-func stringToWiki(s string) (r *MinimalCitations, err error) {
+func parseWiki(s string) (r *MinimalCitations, err error) {
err = json.Unmarshal([]byte(s), &r)
return
}
-func stringToBiblioref(s string) (r *BiblioRef, err error) {
+func parseBiblioref(s string) (r *BiblioRef, err error) {
err = json.Unmarshal([]byte(s), &r)
return
}
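
The renamed helpers are thin json.Unmarshal wrappers; a typical call site, mirroring the ones above, takes a tab-separated (key, doc) line and parses the document column (the wrapper name parseDocColumn is hypothetical):

func parseDocColumn(line string) (*Release, error) {
	// Column 2 of the (key, doc) line holds the JSON document, column 1 the key.
	return parseRelease(Cut(line, 2))
}
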