tweaks; move parsing out of command

author: Martin Czygan <martin.czygan@gmail.com> 2021-05-04 23:59:53 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-05-04 23:59:53 +0200
commit: a63d76e3fc3c59c2eec2de4e538b45e41e1f8aa9 (patch)
tree: dbfeb167e56b3d581877a0224e56be8423852aa0 /skate
parent: 6462e64ce8e61f54e1c3b1247c2039a2eddd5875 (diff)
download: refcat-a63d76e3fc3c59c2eec2de4e538b45e41e1f8aa9.tar.gz
refcat-a63d76e3fc3c59c2eec2de4e538b45e41e1f8aa9.zip
3 files changed, 81 insertions, 72 deletions
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
index 754eab8..de11de1 100644
--- a/skate/cmd/skate-cluster/main.go
+++ b/skate/cmd/skate-cluster/main.go
@@ -1,5 +1,5 @@
-// skate-cluster takes the (tab) output of skate-sorted-keys and generates a
-// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
+// skate-cluster takes the (tab) output of skate-map (plus sort) and generates
+// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
 // require refs and release docs in a single cluster).
 //
 // For example, this:
@@ -44,10 +44,12 @@ func main() {
 		batch, fields  []string
 		keyIndex       = *keyField - 1
 		docIndex       = *docField - 1
+		line           string
+		err            error
 	)
 	defer bw.Flush()
 	for {
-		line, err := br.ReadString('\n')
+		line, err = br.ReadString('\n')
 		if err == io.EOF {
 			break
 		}
@@ -79,16 +81,16 @@ func main() {
 
 // containsBoth return true, if we have a ref and a non-ref item in the batch.
 func containsBoth(batch []string) bool {
-	var isRef int
+	var numRef int
 	for _, doc := range batch {
-		// This is brittle. Most JSON should be in compact form, and there the
-		// following chars are by convention added to distinguish a release
-		// coming from a reference doc from other releases.
+		// This is brittle (but faster). Most JSON should be in compact form,
+		// where the following chars are, by convention, added to distinguish
+		// a release coming from a reference doc from other releases.
 		if strings.Contains(doc, `"status":"ref"`) {
-			isRef++
+			numRef++
 		}
 	}
-	return isRef > 0 && isRef < len(batch)
+	return numRef > 0 && numRef < len(batch)
 }
 
 // writeBatch writes out a single line containing the key and the cluster values.
@@ -102,9 +104,9 @@ func writeBatch(w io.Writer, key string, batch []string) (err error) {
 	if *requireBoth && !containsBoth(batch) {
 		return nil
 	}
-	// This is brittle, but all items in a batch are valid JSON objects, hence,
-	// the following will be valid JSON as well, or will it? The key should not
-	// contain a quote.
+	// This is brittle (and fast): all items in a batch are valid JSON objects,
+	// so the line below is valid JSON too, provided the key needs no escaping,
+	// e.g. the key must not contain a quote (or a backslash).
 	_, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
 	return
 }
diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go
index c2015e2..179057d 100644
--- a/skate/cmd/skate-from-unstructured/main.go
+++ b/skate/cmd/skate-from-unstructured/main.go
@@ -6,9 +6,7 @@ import (
 	"flag"
 	"log"
 	"os"
-	"regexp"
 	"runtime"
-	"strings"
 
 	"git.archive.org/martin/cgraph/skate"
 	"git.archive.org/martin/cgraph/skate/parallel"
@@ -19,11 +17,6 @@ var (
 	numWorkers   = flag.Int("w", runtime.NumCPU(), "number of workers")
 	batchSize    = flag.Int("b", 100000, "batch size")
 	bytesNewline = []byte("\n")
-
-	PatDOI         = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
-	PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
-	PatArxivPDF    = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
-	PatArxivAbs    = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
 )
 
 func main() {
@@ -32,7 +25,7 @@ func main() {
 		if err := json.Unmarshal(p, &ref); err != nil {
 			return nil, err
 		}
-		if err := parseUnstructured(&ref); err != nil {
+		if err := skate.ParseUnstructured(&ref); err != nil {
 			return nil, err
 		}
 		return skate.JsonMarshalLine(&ref)
@@ -43,55 +36,3 @@ func main() {
 		log.Fatal(err)
 	}
 }
-
-// parseUnstructured will in-place augment missing DOI, arxiv id and so on.
-func parseUnstructured(ref *skate.Ref) error {
-	uns := ref.Biblio.Unstructured
-	var (
-		v  string
-		vs []string
-	)
-	// Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
-	// 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
-	if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" {
-		parts := strings.Split(strings.ToLower(ref.Key), "-bib")
-		ref.Biblio.DOI = parts[0]
-	}
-	// DOI
-	v = PatDOI.FindString(uns)
-	if v != "" && ref.Biblio.DOI == "" {
-		ref.Biblio.DOI = v
-	}
-	// DOI in Key
-	v = PatDOINoHyphen.FindString(ref.Key)
-	if v != "" && ref.Biblio.DOI == "" {
-		ref.Biblio.DOI = v
-	}
-	// DOI in URL
-	prefixes := []string{
-		"http://doi.org/",
-		"https://doi.org/",
-		"http://dx.doi.org/",
-		"https://dx.doi.org/",
-	}
-	for _, prefix := range prefixes {
-		if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
-			ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1)
-		}
-	}
-	v = PatDOINoHyphen.FindString(ref.Key)
-	if v != "" && ref.Biblio.DOI == "" {
-		ref.Biblio.DOI = v
-	}
-	// Arxiv
-	vs = PatArxivPDF.FindStringSubmatch(uns)
-	if len(vs) != 0 && ref.Biblio.ArxivId == "" {
-		ref.Biblio.ArxivId = vs[1]
-	} else {
-		vs = PatArxivAbs.FindStringSubmatch(uns)
-		if len(vs) != 0 && ref.Biblio.ArxivId == "" {
-			ref.Biblio.ArxivId = vs[1]
-		}
-	}
-	return nil
-}
diff --git a/skate/unstructured.go b/skate/unstructured.go
new file mode 100644
index 0000000..6a96bb0
--- /dev/null
+++ b/skate/unstructured.go
@@ -0,0 +1,66 @@
+package skate
+
+import (
+	"regexp"
+	"strings"
+)
+
+// Patterns used to find DOI and arXiv identifiers in reference keys and text.
+var (
+	PatDOI         = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+	PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
+	PatArxivPDF    = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
+	PatArxivAbs    = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
+
+	// urlPrefixes are the URL prefixes stripped from Biblio.Url to get a DOI.
+	urlPrefixes = []string{
+		"http://doi.org/",
+		"https://doi.org/",
+		"http://dx.doi.org/",
+		"https://dx.doi.org/",
+	}
+)
+
+// ParseUnstructured fills in a missing DOI and arXiv id on ref, in place. DOI
+// candidates are tried in order: key with "-bib" suffix, unstructured text,
+// key, URL; an already set DOI or arXiv id is never overwritten. The returned
+// error is always nil.
+func ParseUnstructured(ref *Ref) error {
+	var (
+		uns = ref.Biblio.Unstructured
+		v   string
+		vs  []string
+	)
+	// Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
+	// 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
+	if key := strings.ToLower(ref.Key); ref.Biblio.DOI == "" && strings.Contains(key, "-bib") {
+		ref.Biblio.DOI = strings.Split(key, "-bib")[0]
+	}
+	// DOI
+	v = PatDOI.FindString(uns)
+	if v != "" && ref.Biblio.DOI == "" {
+		ref.Biblio.DOI = v
+	}
+	// DOI in Key
+	v = PatDOINoHyphen.FindString(ref.Key)
+	if v != "" && ref.Biblio.DOI == "" {
+		ref.Biblio.DOI = v
+	}
+	// DOI in URL; only used as a fallback, so that an existing DOI is kept.
+	for _, prefix := range urlPrefixes {
+		if ref.Biblio.DOI == "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
+			ref.Biblio.DOI = strings.TrimPrefix(ref.Biblio.Url, prefix)
+		}
+	}
+	// Arxiv; the PDF link is tried first, then the abstract link.
+	vs = PatArxivPDF.FindStringSubmatch(uns)
+	if len(vs) != 0 && ref.Biblio.ArxivId == "" {
+		ref.Biblio.ArxivId = vs[1]
+	} else {
+		vs = PatArxivAbs.FindStringSubmatch(uns)
+		if len(vs) != 0 && ref.Biblio.ArxivId == "" {
+			ref.Biblio.ArxivId = vs[1]
+		}
+	}
+	return nil
+}
author	Martin Czygan <martin.czygan@gmail.com>	2021-05-04 23:59:53 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-05-04 23:59:53 +0200
commit	a63d76e3fc3c59c2eec2de4e538b45e41e1f8aa9 (patch)
tree	dbfeb167e56b3d581877a0224e56be8423852aa0 /skate
parent	6462e64ce8e61f54e1c3b1247c2039a2eddd5875 (diff)
download	refcat-a63d76e3fc3c59c2eec2de4e538b45e41e1f8aa9.tar.gz refcat-a63d76e3fc3c59c2eec2de4e538b45e41e1f8aa9.zip