diff options
-rw-r--r-- | python/notes/version_3.md | 6 | ||||
-rw-r--r-- | python/refcat/tasks.py | 24 | ||||
-rw-r--r-- | skate/.gitignore | 1 | ||||
-rw-r--r-- | skate/Makefile | 2 | ||||
-rw-r--r-- | skate/cmd/skate-biblioref-from-wikipedia/main.go | 35 | ||||
-rw-r--r-- | skate/schema.go | 8 |
6 files changed, 75 insertions, 1 deletion
diff --git a/python/notes/version_3.md b/python/notes/version_3.md index 4f165f0..e4794a6 100644 --- a/python/notes/version_3.md +++ b/python/notes/version_3.md @@ -199,3 +199,9 @@ $ time zstdcat -T0 /magna/refcat/UnmatchedRefs/date-2021-02-20.json.zst | LC_ALL 260768384 ``` +---- + +# Wikipedia + +* /magna/data/wikipedia_citations_2020-07-14 + diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 4851f2b..df56b9d 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -177,6 +177,20 @@ class MAGPapers(luigi.ExternalTask, Refcat): def output(self): return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd) +class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): + """ + From archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia + Citations: A comprehensive dataset of citations with identifiers extracted + from English Wikipedia). + + Dataset contains parquet, but we want JSON here: + + $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json + """ + def output(self): + return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json")) + + # ----8< Derivations class RefsWithUnstructured(Refcat): @@ -1412,3 +1426,13 @@ class MAGDOI(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + +# ==== WikipediaCitations + +class BiblioRefWikipediaCitations(Refcat): + """ + Generate a biblioref schema from wikipedia citations minimal file. 
+ """ + + def requires(self): + return WikipediaCitationsMinimalDataset() diff --git a/skate/.gitignore b/skate/.gitignore index 9e9e00c..723853e 100644 --- a/skate/.gitignore +++ b/skate/.gitignore @@ -24,6 +24,7 @@ /skate-to-doi /skate-bref-id /skate-from-unstructured +/skate-biblioref-from-wikipedia packaging/debian/skate/usr skate_*_amd64.deb diff --git a/skate/Makefile b/skate/Makefile index 747a478..3574f4d 100644 --- a/skate/Makefile +++ b/skate/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-biblioref skate-cluster-stats skate-verify skate-to-doi skate-bref-id skate-from-unstructured +TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-biblioref skate-cluster-stats skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-biblioref-from-wikipedia PKGNAME := skate .PHONY: test diff --git a/skate/cmd/skate-biblioref-from-wikipedia/main.go b/skate/cmd/skate-biblioref-from-wikipedia/main.go new file mode 100644 index 0000000..552b625 --- /dev/null +++ b/skate/cmd/skate-biblioref-from-wikipedia/main.go @@ -0,0 +1,35 @@ +package main + +import ( + "flag" + "log" + "os" + "runtime" + + "git.archive.org/martin/cgraph/skate" + "git.archive.org/martin/cgraph/skate/parallel" + jsoniter "github.com/json-iterator/go" +) + +var ( + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 100000, "batch size") + json = jsoniter.ConfigCompatibleWithStandardLibrary + bytesNewline = []byte("\n") +) + +func main() { + flag.Parse() + pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { + var w skate.MinimalCitations + if err := json.Unmarshal(p, &w); err != nil { + return nil, err + } + return nil, nil + }) + pp.NumWorkers = *numWorkers + pp.BatchSize = *batchSize + if err := pp.Run(); err != nil { + log.Fatal(err) + } +} diff --git a/skate/schema.go b/skate/schema.go index d0c6833..ff59b61 100644 --- a/skate/schema.go 
+++ b/skate/schema.go @@ -253,3 +253,11 @@ type Group struct { func (g *Group) String() string { return fmt.Sprintf("<Group A/B %d/%d>", len(g.A), len(g.B)) } + +// MinimalCitations variant from archive.org/details/wikipedia_citations_2020-07-14. +type MinimalCitations struct { + IDList string `json:"ID_list"` + PageTitle string `json:"page_title"` + Title string `json:"Title"` + TypeOfCitation string `json:"type_of_citation"` +} |