about summary refs log tree commit diff stats
path: root/skate
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-03-31 23:50:52 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-03-31 23:50:52 +0200
commit a7e0e8fe3a78fa6a9872cefcb7a9eb261128f6f2 (patch)
tree 2850c2ed0ecd0392deab2fe82c30c78465f1576c /skate
parent cedef349990c12747efdbe10f692c57436dc5c91 (diff)
download refcat-a7e0e8fe3a78fa6a9872cefcb7a9eb261128f6f2.tar.gz
refcat-a7e0e8fe3a78fa6a9872cefcb7a9eb261128f6f2.zip
cleanup command
Diffstat (limited to 'skate')
-rw-r--r-- skate/Makefile 2
-rw-r--r-- skate/cmd/skate-cluster-stats/main.go 92
2 files changed, 1 insertions, 93 deletions
diff --git a/skate/Makefile b/skate/Makefile
index ccfbe9e..e4c584d 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-cluster-stats skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-biblioref-from-wikipedia
+TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-biblioref-from-wikipedia
PKGNAME := skate
.PHONY: test
diff --git a/skate/cmd/skate-cluster-stats/main.go b/skate/cmd/skate-cluster-stats/main.go
deleted file mode 100644
index 4973b4d..0000000
--- a/skate/cmd/skate-cluster-stats/main.go
+++ /dev/null
@@ -1,92 +0,0 @@
-package main
-
-import (
- "flag"
- "fmt"
- "log"
- "os"
- "runtime"
-
- jsoniter "github.com/json-iterator/go"
- "git.archive.org/martin/cgraph/skate"
- "git.archive.org/martin/cgraph/skate/parallel"
-)
-
-var (
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 100000, "batch size")
- bestEffort = flag.Bool("B", false, "best effort, log errors")
- // unmatched: clusters w/ refs only
- // count: number of entities in cluster (by type)
- // default: key and number of values
- mode = flag.String("m", "", "what to extract (unmatched, count, ...)")
-
- json = jsoniter.ConfigCompatibleWithStandardLibrary
- bytesNewline = []byte("\n")
-)
-
-type Func func([]byte) ([]byte, error)
-
-func main() {
- flag.Parse()
- var f Func
- switch *mode {
- case "unmatched":
- f = func(p []byte) ([]byte, error) {
- var cluster skate.ReleaseCluster
- if err := json.Unmarshal(p, &cluster); err != nil {
- if *bestEffort {
- log.Printf("%v", err)
- return nil, nil
- }
- log.Fatal(err)
- }
- var refs int
- for _, v := range cluster.Values {
- if v.Extra.Skate.Status == "ref" {
- refs++
- }
- }
- if refs == len(cluster.Values) {
- return p, nil
- }
- return nil, nil
- }
- case "count":
- f = func(p []byte) ([]byte, error) {
- var cluster skate.ReleaseCluster
- if err := json.Unmarshal(p, &cluster); err != nil {
- if *bestEffort {
- log.Printf("%v", err)
- return nil, nil
- }
- log.Fatal(err)
- }
- var refs int
- for _, v := range cluster.Values {
- if v.Extra.Skate.Status == "ref" {
- refs++
- }
- }
- // total, refs, non-refs, key
- s := fmt.Sprintf("%d\t%d\t%d\t%s\n",
- len(cluster.Values), refs, len(cluster.Values)-refs, cluster.Key)
- return []byte(s), nil
- }
- default:
- f = func(p []byte) ([]byte, error) {
- var cluster skate.ReleaseCluster
- if err := json.Unmarshal(p, &cluster); err != nil {
- return nil, err
- }
- s := fmt.Sprintf("%d\t%s\n", len(cluster.Values), cluster.Key)
- return []byte(s), nil
- }
- }
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
-}