aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-03-30 01:35:58 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-03-30 01:35:58 +0200
commit e8ed1ff2a60b694b242669a50c5a37346f3b6d79 (patch)
tree 28eeed2962722ff9c63cb7db80cf559b68fd3b97
parent 956476225837ad9ccf30c9698806e3fd959b75ef (diff)
download refcat-e8ed1ff2a60b694b242669a50c5a37346f3b6d79.tar.gz
refcat-e8ed1ff2a60b694b242669a50c5a37346f3b6d79.zip
stub wikipedia converter
-rw-r--r-- python/notes/version_3.md 6
-rw-r--r-- python/refcat/tasks.py 24
-rw-r--r-- skate/.gitignore 1
-rw-r--r-- skate/Makefile 2
-rw-r--r-- skate/cmd/skate-biblioref-from-wikipedia/main.go 35
-rw-r--r-- skate/schema.go 8
6 files changed, 75 insertions, 1 deletions
diff --git a/python/notes/version_3.md b/python/notes/version_3.md
index 4f165f0..e4794a6 100644
--- a/python/notes/version_3.md
+++ b/python/notes/version_3.md
@@ -199,3 +199,9 @@ $ time zstdcat -T0 /magna/refcat/UnmatchedRefs/date-2021-02-20.json.zst | LC_ALL
260768384
```
+----
+
+# Wikipedia
+
+* /magna/data/wikipedia_citations_2020-07-14
+
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 4851f2b..df56b9d 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -177,6 +177,20 @@ class MAGPapers(luigi.ExternalTask, Refcat):
def output(self):
return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd)
+class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
+ """
+ From archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia
+ Citations: A comprehensive dataset of citations with identifiers extracted
+ from English Wikipedia).
+
+ Dataset contains parquet, but we want JSON here:
+
+ $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
+ """
+ def output(self):  # external input: this class defines no run(), only the expected target
+ return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))  # plain local file; no format= argument is passed
+
+
# ----8< Derivations
class RefsWithUnstructured(Refcat):
@@ -1412,3 +1426,13 @@ class MAGDOI(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+# ==== WikipediaCitations
+
+class BiblioRefWikipediaCitations(Refcat):
+ """
+ Generate a biblioref schema from wikipedia citations minimal file.
+ """
+
+ def requires(self):  # only requires() is defined in this commit; no run() or output() yet
+ return WikipediaCitationsMinimalDataset()  # single upstream dependency: the converted JSON file
diff --git a/skate/.gitignore b/skate/.gitignore
index 9e9e00c..723853e 100644
--- a/skate/.gitignore
+++ b/skate/.gitignore
@@ -24,6 +24,7 @@
/skate-to-doi
/skate-bref-id
/skate-from-unstructured
+/skate-biblioref-from-wikipedia
packaging/debian/skate/usr
skate_*_amd64.deb
diff --git a/skate/Makefile b/skate/Makefile
index 747a478..3574f4d 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-biblioref skate-cluster-stats skate-verify skate-to-doi skate-bref-id skate-from-unstructured
+TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-biblioref skate-cluster-stats skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-biblioref-from-wikipedia
PKGNAME := skate
.PHONY: test
diff --git a/skate/cmd/skate-biblioref-from-wikipedia/main.go b/skate/cmd/skate-biblioref-from-wikipedia/main.go
new file mode 100644
index 0000000..552b625
--- /dev/null
+++ b/skate/cmd/skate-biblioref-from-wikipedia/main.go
@@ -0,0 +1,35 @@
+package main
+
+import (
+ "flag"
+ "log"
+ "os"
+ "runtime"
+
+ "git.archive.org/martin/cgraph/skate"
+ "git.archive.org/martin/cgraph/skate/parallel"
+ jsoniter "github.com/json-iterator/go"
+)
+
+var (
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") // -w flag; defaults to runtime.NumCPU(), copied to pp.NumWorkers in main
+ batchSize = flag.Int("b", 100000, "batch size") // -b flag; copied to pp.BatchSize in main
+ json = jsoniter.ConfigCompatibleWithStandardLibrary // jsoniter config bound to the name json; this file does not import encoding/json
+ bytesNewline = []byte("\n") // not referenced elsewhere in this file as of this commit
+)
+
+func main() { // Stub converter: decodes input from stdin as skate.MinimalCitations and emits nothing yet.
+ flag.Parse()
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { // p is one input chunk; presumably one JSON line - TODO confirm against parallel.Processor
+ var w skate.MinimalCitations
+ if err := json.Unmarshal(p, &w); err != nil {
+ return nil, err // decode failure is handed back to the processor
+ }
+ return nil, nil // decoded value w is discarded; the callback returns no output bytes
+ })
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err) // any error returned by Run terminates the process
+ }
+}
diff --git a/skate/schema.go b/skate/schema.go
index d0c6833..ff59b61 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -253,3 +253,11 @@ type Group struct {
func (g *Group) String() string {
return fmt.Sprintf("<Group A/B %d/%d>", len(g.A), len(g.B))
}
+
+// MinimalCitations is the record variant from archive.org/details/wikipedia_citations_2020-07-14; only the four JSON keys tagged below are mapped.
+type MinimalCitations struct {
+ IDList string `json:"ID_list"` // decoded as a plain string; presumably a serialized identifier list - TODO confirm
+ PageTitle string `json:"page_title"` // JSON key page_title
+ Title string `json:"Title"` // JSON key is capitalized, unlike page_title
+ TypeOfCitation string `json:"type_of_citation"` // JSON key type_of_citation
+}