about | summary | refs | log | tree | commit | diff | stats
path: root/skate/cmd
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-05-05 15:55:39 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-05-05 15:55:39 +0200
commit 634b7b7d910ddb20c5af0722de41ef5ccded7358 (patch)
tree d83f5fb36dc4c98035511059202fc51dc676ee54 /skate/cmd
parent a380bffa5fb0cf20ee84ede6fa590bf38e3675f8 (diff)
parent 134752c2a160986c13d6c2b9428cb2720ed382d0 (diff)
download refcat-634b7b7d910ddb20c5af0722de41ef5ccded7358.tar.gz
refcat-634b7b7d910ddb20c5af0722de41ef5ccded7358.zip
Merge branch 'master' of git.archive.org:martin/cgraph
* 'master' of git.archive.org:martin/cgraph: (24 commits)
  update notes
  make: run go mod tidy after build
  add test for ParseUnstructured
  remove stub file
  tweaks; move parsing out of command
  skate-map: a bit more help output
  update docs
  set: some tweaks
  update README
  update deps
  start overview docs
  update README
  update docs
  map is a reference type
  fix a typo
  implement a few flags as mapper middleware
  update ignore files
  update deps
  rename skate-ref-to-release to skate-conv
  update README
  ...
Diffstat (limited to 'skate/cmd')
-rw-r--r-- skate/cmd/skate-bref-unmatched/main.go 10
-rw-r--r-- skate/cmd/skate-cluster/main.go 26
-rw-r--r-- skate/cmd/skate-conv/main.go (renamed from skate/cmd/skate-ref-to-release/main.go) 60
-rw-r--r-- skate/cmd/skate-dot/main.go 5
-rw-r--r-- skate/cmd/skate-from-unstructured/main.go 61
-rw-r--r-- skate/cmd/skate-map/main.go 65
-rw-r--r-- skate/cmd/skate-wikipedia-doi/main.go 1
7 files changed, 100 insertions, 128 deletions
diff --git a/skate/cmd/skate-bref-unmatched/main.go b/skate/cmd/skate-bref-unmatched/main.go
deleted file mode 100644
index d8cb34f..0000000
--- a/skate/cmd/skate-bref-unmatched/main.go
+++ /dev/null
@@ -1,10 +0,0 @@
-// skate-bref-unmatched takes a bref TSV sorted by source_release_ident and a
-// refs file sorted by release_ident and exports a bref file that will include
-// unmatched references as well.
-package main
-
-import "log"
-
-func main() {
- log.Println("skate-bref-unmatched")
-}
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
index 754eab8..de11de1 100644
--- a/skate/cmd/skate-cluster/main.go
+++ b/skate/cmd/skate-cluster/main.go
@@ -1,5 +1,5 @@
-// skate-cluster takes the (tab) output of skate-sorted-keys and generates a
-// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
+// skate-cluster takes the (tab) output of skate-map (plus sort) and generates
+// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
// require refs and release docs in a single cluster).
//
// For example, this:
@@ -44,10 +44,12 @@ func main() {
batch, fields []string
keyIndex = *keyField - 1
docIndex = *docField - 1
+ line string
+ err error
)
defer bw.Flush()
for {
- line, err := br.ReadString('\n')
+ line, err = br.ReadString('\n')
if err == io.EOF {
break
}
@@ -79,16 +81,16 @@ func main() {
// containsBoth return true, if we have a ref and a non-ref item in the batch.
func containsBoth(batch []string) bool {
- var isRef int
+ var numRef int
for _, doc := range batch {
- // This is brittle. Most JSON should be in compact form, and there the
- // following chars are by convention added to distinguish a release
- // coming from a reference doc from other releases.
+ // This is brittle (but faster). Most JSON should be in compact form,
+ // and there the following chars are by convention added to distinguish
+ // a release coming from a reference doc from other releases.
if strings.Contains(doc, `"status":"ref"`) {
- isRef++
+ numRef++
}
}
- return isRef > 0 && isRef < len(batch)
+ return numRef > 0 && numRef < len(batch)
}
// writeBatch writes out a single line containing the key and the cluster values.
@@ -102,9 +104,9 @@ func writeBatch(w io.Writer, key string, batch []string) (err error) {
if *requireBoth && !containsBoth(batch) {
return nil
}
- // This is brittle, but all items in a batch are valid JSON objects, hence,
- // the following will be valid JSON as well, or will it? The key should not
- // contain a quote.
+ // This is brittle (and fast), but all items in a batch are valid JSON
+ // objects, hence, the following will be valid JSON as well, or will it?
+ // The key should not contain a quote.
_, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
return
}
diff --git a/skate/cmd/skate-ref-to-release/main.go b/skate/cmd/skate-conv/main.go
index d547e62..647472e 100644
--- a/skate/cmd/skate-ref-to-release/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -1,5 +1,9 @@
-// skate-ref-to-release converts a "ref" document to a "release" document.
+// skate-conv converts various schemas into releases. This should replace the
+// very specific skate-ref-to-release and the like.
//
+// $ skate-conv -f ref < FILE > FILE
+//
+// Currently source schemas: "ref", "ol", "rg"
package main
import (
@@ -10,19 +14,38 @@ import (
"strings"
"git.archive.org/martin/cgraph/skate"
- "github.com/miku/parallel"
-
+ "git.archive.org/martin/cgraph/skate/parallel"
json "github.com/segmentio/encoding/json"
)
var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 100000, "batch size")
- fromFormat = flag.String("f", "ref", "import data shape")
+ fromFormat = flag.String("f", "ref", "import schema")
bytesNewline = []byte("\n")
+ f func([]byte) ([]byte, error)
)
+func main() {
+ flag.Parse()
+ switch *fromFormat {
+ case "ref":
+ f = refToRelease
+ case "rg":
+ f = rgSitemapToRelease
+ case "ol":
+ f = openLibraryToRelease
+ }
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+}
+
+// refToRelease converts a ref document to a release.
func refToRelease(p []byte) ([]byte, error) {
var ref skate.Ref
if err := json.Unmarshal(p, &ref); err != nil {
@@ -60,22 +83,17 @@ func rgSitemapToRelease(p []byte) ([]byte, error) {
return b, err
}
-func main() {
- flag.Parse()
- switch *fromFormat {
- case "ref":
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, refToRelease)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
- case "rg":
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, rgSitemapToRelease)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
+func openLibraryToRelease(p []byte) ([]byte, error) {
+ var w skate.OpenLibraryWork
+ if err := json.Unmarshal(p, &w); err != nil {
+ return nil, err
}
+ release, err := skate.OpenLibraryToRelease(&w)
+ if err != nil {
+ return nil, err
+ }
+ release.Extra.Skate.Status = "ol"
+ b, err := json.Marshal(release)
+ b = append(b, bytesNewline...)
+ return b, err
}
diff --git a/skate/cmd/skate-dot/main.go b/skate/cmd/skate-dot/main.go
index 3ef99d5..573209e 100644
--- a/skate/cmd/skate-dot/main.go
+++ b/skate/cmd/skate-dot/main.go
@@ -1,5 +1,6 @@
-// skate-dot generates dot files from inbound and outbound citation links. Just
-// a demo, replacement for a couple python scripts.
+// [wip] skate-dot generates dot files from inbound and outbound citation
+// links. Just a demo, replacement for a couple python scripts. We want things
+// like: https://git.io/JObzq.
package main
import (
diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go
index c2015e2..179057d 100644
--- a/skate/cmd/skate-from-unstructured/main.go
+++ b/skate/cmd/skate-from-unstructured/main.go
@@ -6,9 +6,7 @@ import (
"flag"
"log"
"os"
- "regexp"
"runtime"
- "strings"
"git.archive.org/martin/cgraph/skate"
"git.archive.org/martin/cgraph/skate/parallel"
@@ -19,11 +17,6 @@ var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 100000, "batch size")
bytesNewline = []byte("\n")
-
- PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
- PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
- PatArxivPDF = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
- PatArxivAbs = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
)
func main() {
@@ -32,7 +25,7 @@ func main() {
if err := json.Unmarshal(p, &ref); err != nil {
return nil, err
}
- if err := parseUnstructured(&ref); err != nil {
+ if err := skate.ParseUnstructured(&ref); err != nil {
return nil, err
}
return skate.JsonMarshalLine(&ref)
@@ -43,55 +36,3 @@ func main() {
log.Fatal(err)
}
}
-
-// parseUnstructured will in-place augment missing DOI, arxiv id and so on.
-func parseUnstructured(ref *skate.Ref) error {
- uns := ref.Biblio.Unstructured
- var (
- v string
- vs []string
- )
- // Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
- // 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
- if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" {
- parts := strings.Split(strings.ToLower(ref.Key), "-bib")
- ref.Biblio.DOI = parts[0]
- }
- // DOI
- v = PatDOI.FindString(uns)
- if v != "" && ref.Biblio.DOI == "" {
- ref.Biblio.DOI = v
- }
- // DOI in Key
- v = PatDOINoHyphen.FindString(ref.Key)
- if v != "" && ref.Biblio.DOI == "" {
- ref.Biblio.DOI = v
- }
- // DOI in URL
- prefixes := []string{
- "http://doi.org/",
- "https://doi.org/",
- "http://dx.doi.org/",
- "https://dx.doi.org/",
- }
- for _, prefix := range prefixes {
- if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
- ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1)
- }
- }
- v = PatDOINoHyphen.FindString(ref.Key)
- if v != "" && ref.Biblio.DOI == "" {
- ref.Biblio.DOI = v
- }
- // Arxiv
- vs = PatArxivPDF.FindStringSubmatch(uns)
- if len(vs) != 0 && ref.Biblio.ArxivId == "" {
- ref.Biblio.ArxivId = vs[1]
- } else {
- vs = PatArxivAbs.FindStringSubmatch(uns)
- if len(vs) != 0 && ref.Biblio.ArxivId == "" {
- ref.Biblio.ArxivId = vs[1]
- }
- }
- return nil
-}
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index ee02875..227acf2 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -1,9 +1,10 @@
-// skate-map runs a given map function over input data. We mostly want to
+// skate-map runs a given "map" function over input data. Here, we mostly want to
// extract a key from a json document. For simple cases, you can use `jq` and
-// other tools. Some key derivations require a bit more.
+// other tools. Some key derivations require a bit more, hence a dedicated program.
//
-// An example with mostly unix tools. We want to extract the DOI and sort by
-// it; we also want to do this fast, hence parallel, LC_ALL, etc.
+// An example with mostly unix tools. We want to extract the DOI from newline
+// delimited JSON and sort by it; we also want to do this fast, hence parallel,
+// LC_ALL, etc.
//
// $ zstdcat -T0 file.zst | (1)
// LC_ALL=C tr -d '\t' | (2) *
@@ -21,15 +22,15 @@
// be skipped, if we limit number of splits)
// (3) we pass the data to jq, with a bit larger buffer (default is 1MB)
// (4) we want no "null" output
-// (5) tostring prints input as string, because we need to carry the document forward
-// (6) but we need some cleanup, too
+// (5) tostring prints the input as string, because we need to carry the document forward ...
+// (6) ... but we'll need some cleanup, too
// (7) we normalize the DOI to lowercase
// (8) a custom filter to normalize a DOI in a specific column
// (9) sorting by DOI
//
// This is reasonably fast, but some cleanup is ugly. We also want more complex
-// keys, e.g. more normalizations, etc. We'd like to encapsulate (2) to (8).
-
+// keys, e.g. more normalizations, etc; in short: we'd like to encapsulate (2)
+// to (8) with `skate-map`.
package main
import (
@@ -45,21 +46,26 @@ import (
)
var (
- mapperName = flag.String("m", "", "mapper to run")
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 50000, "batch size")
- verbose = flag.Bool("verbose", false, "show progress")
- keyPrefix = flag.String("p", "", "a key prefix to use")
- extraValue = flag.String("x", "", "extra value to pass to configurable mappers")
+ mapperName = flag.String("m", "", "mapper to run")
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 50000, "batch size")
+ verbose = flag.Bool("verbose", false, "show progress")
+ keyPrefix = flag.String("p", "", "a key prefix to use")
+ extraValue = flag.String("x", "", "extra value to pass to configurable mappers")
+ bestEffort = flag.Bool("B", false, "best effort")
+ logFile = flag.String("log", "", "log filename")
+ skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given column (zero indexed)")
+
+ help = `skate-map available mappers
+
+ $ skate-map -m ts < file.ndj > file.tsv
+ `
)
func main() {
flag.Parse()
- // TODO
- // [ ] add prefixes and a way to derive multiple keys in one go
- // [ ] how to store multiple keys, sorted?
- // [ ] maybe wrap jq and parallel for arbitrary nested keys
availableMappers := map[string]skate.Mapper{
+ // Add new mapper functions here.
"id": skate.Identity,
"ff": skate.CreateFixedMapper(*extraValue),
"ti": skate.MapperTitle,
@@ -67,15 +73,29 @@ func main() {
"ty": skate.MapperTitleNysiis,
"ts": skate.MapperTitleSandcrawler,
}
+ if *logFile != "" {
+ f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer f.Close()
+ log.SetOutput(f)
+ }
switch {
case *mapperName != "":
- if f, ok := availableMappers[*mapperName]; !ok {
+ if mapf, ok := availableMappers[*mapperName]; !ok {
log.Fatalf("unknown mapper name: %v", *mapperName)
} else {
+ if *skipOnEmpty >= 0 {
+ mapf = skate.WithSkipOnEmpty(mapf, *skipOnEmpty)
+ }
if *keyPrefix != "" {
- f = skate.WithPrefix(f, *keyPrefix)
+ mapf = skate.WithPrefix(mapf, *keyPrefix)
+ }
+ if *bestEffort {
+ mapf = skate.WithBestEffort(mapf)
}
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, f.AsTSV)
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapf.AsTSV)
pp.NumWorkers = *numWorkers
pp.BatchSize = *batchSize
pp.Verbose = *verbose
@@ -84,8 +104,7 @@ func main() {
}
}
default:
- fmt.Println("skate-map available mappers")
- fmt.Println()
+ fmt.Println(help)
w := tabwriter.NewWriter(os.Stdout, 0, 0, 4, ' ', 0)
for k, v := range availableMappers {
fmt.Fprintf(w, "%s\t%s\n", k, skate.NameOf(v))
diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index d1a21e9..c4fdb1e 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -1,3 +1,4 @@
+// skate-wikipedia-doi extracts DOI from wikipedia reference dataset.
package main
import (