aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
Diffstat (limited to 'skate')
-rw-r--r-- skate/.gitignore 1
-rw-r--r-- skate/Makefile 2
-rw-r--r-- skate/cmd/skate-biblioref-from-wikipedia/main.go 35
-rw-r--r-- skate/schema.go 8
4 files changed, 45 insertions, 1 deletions
diff --git a/skate/.gitignore b/skate/.gitignore
index 9e9e00c..723853e 100644
--- a/skate/.gitignore
+++ b/skate/.gitignore
@@ -24,6 +24,7 @@
/skate-to-doi
/skate-bref-id
/skate-from-unstructured
+/skate-biblioref-from-wikipedia
packaging/debian/skate/usr
skate_*_amd64.deb
diff --git a/skate/Makefile b/skate/Makefile
index 747a478..3574f4d 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-biblioref skate-cluster-stats skate-verify skate-to-doi skate-bref-id skate-from-unstructured
+TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-biblioref skate-cluster-stats skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-biblioref-from-wikipedia
PKGNAME := skate
.PHONY: test
diff --git a/skate/cmd/skate-biblioref-from-wikipedia/main.go b/skate/cmd/skate-biblioref-from-wikipedia/main.go
new file mode 100644
index 0000000..552b625
--- /dev/null
+++ b/skate/cmd/skate-biblioref-from-wikipedia/main.go
@@ -0,0 +1,35 @@
+package main
+
+import (
+ "flag"
+ "log"
+ "os"
+ "runtime"
+
+ "git.archive.org/martin/cgraph/skate"
+ "git.archive.org/martin/cgraph/skate/parallel"
+ jsoniter "github.com/json-iterator/go"
+)
+
+var (
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") // -w flag; default is runtime.NumCPU(); copied into pp.NumWorkers in main
+ batchSize = flag.Int("b", 100000, "batch size") // -b flag; copied into pp.BatchSize in main
+ json = jsoniter.ConfigCompatibleWithStandardLibrary // package-level `json` is the jsoniter configuration, not encoding/json
+ bytesNewline = []byte("\n") // NOTE(review): not referenced anywhere else in this file as added here
+)
+
+func main() { // wires stdin/stdout into a parallel.Processor whose callback decodes each input into skate.MinimalCitations
+ flag.Parse() // populate numWorkers (-w) and batchSize (-b)
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { // presumably called once per input line — confirm in package parallel
+ var w skate.MinimalCitations
+ if err := json.Unmarshal(p, &w); err != nil {
+ return nil, err // decode failure is handed back to the processor
+ }
+ return nil, nil // NOTE(review): decoded w is discarded and no bytes are returned; looks like a placeholder for the conversion step — confirm
+ })
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err) // log the error and exit with a non-zero status
+ }
+}
diff --git a/skate/schema.go b/skate/schema.go
index d0c6833..ff59b61 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -253,3 +253,11 @@ type Group struct {
func (g *Group) String() string {
return fmt.Sprintf("<Group A/B %d/%d>", len(g.A), len(g.B))
}
+
+// MinimalCitations is a minimal record variant from archive.org/details/wikipedia_citations_2020-07-14; it declares only the four JSON keys tagged below.
+type MinimalCitations struct {
+ IDList string `json:"ID_list"` // NOTE(review): kept as one string; the internal format of the ID list is not visible here
+ PageTitle string `json:"page_title"`
+ Title string `json:"Title"`
+ TypeOfCitation string `json:"type_of_citation"`
+}