-rw-r--r-- | python/refcat/tasks.py          | 25
-rw-r--r-- | skate/.gitignore                |  2
-rw-r--r-- | skate/Makefile                  |  2
-rw-r--r-- | skate/cmd/skate-cleanup/main.go | 93
-rw-r--r-- | skate/cmd/skate-to-doi/main.go  | 58
-rw-r--r-- | skate/go.mod                    |  1
-rw-r--r-- | skate/go.sum                    |  9
7 files changed, 129 insertions, 61 deletions
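In short: skate-to-doi is generalized into skate-cleanup, which selects a per-line filter (DOI or URL) via -c, and the refcat tasks are switched over to it. As a rough illustration of the DOI case, a minimal standalone sketch (not part of the commit; the regular expression is copied from the diff below, the sample input is made up):

package main

import (
	"fmt"
	"regexp"
)

// Same pattern skate-cleanup compiles for -c doi (see main.go below).
var patDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)

func main() {
	// Roughly what happens to field 2 of "1,xxx 10.123/12312 xxx,3"
	// when piped through: skate-cleanup -c doi -d , -f 2
	fmt.Println(patDOI.FindString("xxx 10.123/12312 xxx")) // 10.123/12312
}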
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 915d406..f21fade 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -295,6 +295,29 @@ class URLTabs(Refcat):
     def output(self):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
 
+class URLList(Refcat):
+    """
+    List of cleaned URLs from refs.
+    """
+    def requires(self):
+        return URLTabs()
+
+    def run(self):
+        output = shellout("""
+                          zstdcat -T0 {input} |
+                          cut -f 3 |
+                          skate-cleanup -c url -B -S -f 1 |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+                          zstd -T0 -c > {output}
+                          """,
+                          n=self.n,
+                          tmpdir=self.tmpdir,
+                          input=self.input().path)
+        luigi.LocalTarget(output).move(self.output().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
 class RefsDOI(Refcat):
     """
     Sorted (doi, doc) tuples from refs. 225m48.755s
@@ -306,7 +329,7 @@ class RefsDOI(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x biblio.doi -skip-on-empty 1 |
-                          skate-to-doi -f 1 |
+                          skate-cleanup -c doi -f 1 |
                           LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
                           zstd -T0 -c > {output}
                           """,
diff --git a/skate/.gitignore b/skate/.gitignore
index 2dc8d4c..219bb28 100644
--- a/skate/.gitignore
+++ b/skate/.gitignore
@@ -15,13 +15,13 @@
 # vendor/
 #
 /skate-bref-id
+/skate-cleanup
 /skate-cluster
 /skate-conv
 /skate-dot
 /skate-from-unstructured
 /skate-map
 /skate-reduce
-/skate-to-doi
 /skate-wikipedia-doi
 packaging/debian/skate/usr
 skate_*_amd64.deb
diff --git a/skate/Makefile b/skate/Makefile
index 97b192d..fa8bb92 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
 SHELL := /bin/bash
-TARGETS := skate-conv skate-cluster skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce
+TARGETS := skate-conv skate-cluster skate-cleanup skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce
 PKGNAME := skate
 
 .PHONY: test
diff --git a/skate/cmd/skate-cleanup/main.go b/skate/cmd/skate-cleanup/main.go
new file mode 100644
index 0000000..e1c7f3c
--- /dev/null
+++ b/skate/cmd/skate-cleanup/main.go
@@ -0,0 +1,93 @@
+// Filter to parse out a correct-looking DOI, URL, etc. from a field.
+//
+// $ echo "1,xxx 10.123/12312 xxx,3" | skate-cleanup -c doi -d , -f 2
+// 1,10.123/12312,3
+//
+// We can use this to sanitize fields in the reference dataset.
+
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"regexp"
+	"runtime"
+	"strings"
+
+	"git.archive.org/martin/cgraph/skate/parallel"
+	"mvdan.cc/xurls/v2"
+)
+
+var (
+	numWorkers     = flag.Int("w", runtime.NumCPU(), "number of workers")
+	batchSize      = flag.Int("b", 100000, "batch size")
+	delimiter      = flag.String("d", "\t", "delimiter")
+	index          = flag.Int("f", 1, "field to clean up, 1-indexed")
+	bestEffort     = flag.Bool("B", false, "only log errors, but do not stop")
+	skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches")
+	what           = flag.String("c", "doi", "what to clean: doi, url")
+
+	PatDOI    = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+	rxRelaxed = xurls.Relaxed()
+)
+
+func main() {
+	flag.Parse()
+	var f func([]byte) ([]byte, error)
+	switch *what {
+	case "doi":
+		f = doiFilter
+	case "url":
+		f = urlFilter
+	default:
+		f = doiFilter
+	}
+	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
+	pp.NumWorkers = *numWorkers
+	pp.BatchSize = *batchSize
+	if err := pp.Run(); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// urlFilter finds the first URL in the configured field.
+func urlFilter(p []byte) ([]byte, error) {
+	parts := strings.Split(string(p), *delimiter)
+	if len(parts) < *index {
+		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
+		if *bestEffort {
+			log.Println(msg)
+			return nil, nil
+		} else {
+			return nil, fmt.Errorf(msg)
+		}
+	}
+	url := rxRelaxed.FindString(parts[*index-1])
+	if url == "" && *skipNonMatches {
+		return nil, nil
+	}
+	parts[*index-1] = url
+	return []byte(strings.Join(parts, *delimiter)), nil
+}
+
+// doiFilter finds a DOI in the configured field.
+func doiFilter(p []byte) ([]byte, error) {
+	parts := strings.Split(string(p), *delimiter)
+	if len(parts) < *index {
+		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
+		if *bestEffort {
+			log.Println(msg)
+			return nil, nil
+		} else {
+			return nil, fmt.Errorf(msg)
+		}
+	}
+	result := PatDOI.FindString(parts[*index-1])
+	if result == "" && *skipNonMatches {
+		return nil, nil
+	}
+	parts[*index-1] = result
+	return []byte(strings.Join(parts, *delimiter)), nil
+}
diff --git a/skate/cmd/skate-to-doi/main.go b/skate/cmd/skate-to-doi/main.go
deleted file mode 100644
index 377383f..0000000
--- a/skate/cmd/skate-to-doi/main.go
+++ /dev/null
@@ -1,58 +0,0 @@
-// Filter to parse out a correctly looking DOI from a field.
-//
-// $ echo "1,xxx 10.123/12312 xxx,3" | skate-to-doi -d , -f 2
-// 1,10.123/12312,3
-//
-// We can use this to sanitize DOI-like fields in the reference dataset.
-
-package main
-
-import (
-	"flag"
-	"fmt"
-	"log"
-	"os"
-	"regexp"
-	"runtime"
-	"strings"
-
-	"git.archive.org/martin/cgraph/skate/parallel"
-)
-
-var (
-	numWorkers     = flag.Int("w", runtime.NumCPU(), "number of workers")
-	batchSize      = flag.Int("b", 100000, "batch size")
-	delimiter      = flag.String("d", "\t", "delimiter")
-	index          = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed")
-	bestEffort     = flag.Bool("B", false, "only log errors, but do not stop")
-	skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches")
-
-	PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
-)
-
-func main() {
-	flag.Parse()
-	pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
-		parts := strings.Split(string(p), *delimiter)
-		if len(parts) < *index {
-			msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
-			if *bestEffort {
-				log.Println(msg)
-				return nil, nil
-			} else {
-				return nil, fmt.Errorf(msg)
-			}
-		}
-		result := PatDOI.FindString(parts[*index-1])
-		if len(result) == 0 && *skipNonMatches {
-			return nil, nil
-		}
-		parts[*index-1] = result
-		return []byte(strings.Join(parts, *delimiter)), nil
-	})
-	pp.NumWorkers = *numWorkers
-	pp.BatchSize = *batchSize
-	if err := pp.Run(); err != nil {
-		log.Fatal(err)
-	}
-}
diff --git a/skate/go.mod b/skate/go.mod
index 57ae586..d6b668b 100644
--- a/skate/go.mod
+++ b/skate/go.mod
@@ -11,4 +11,5 @@ require (
 	github.com/segmentio/encoding v0.2.17
 	github.com/tidwall/gjson v1.7.5
 	golang.org/x/text v0.3.6
+	mvdan.cc/xurls/v2 v2.2.0
 )
diff --git a/skate/go.sum b/skate/go.sum
index 96d323d..902db2f 100644
--- a/skate/go.sum
+++ b/skate/go.sum
@@ -5,10 +5,14 @@ github.com/elastic/go-elasticsearch/v7 v7.12.0/go.mod h1:OJ4wdbtDNk5g503kvlHLyEr
 github.com/klauspost/cpuid/v2 v2.0.5/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
 github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI=
 github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
+github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/matryer/is v1.4.0 h1:sosSmIWwkYITGrxZ25ULNDeKiMNzFSr4V/eqBQP0PeE=
 github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
 github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e h1:S+/ptYdZtpK/MDstwCyt+ZHdXEpz86RJZ5gyZU4txJY=
 github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e/go.mod h1:uFMI8w+ref4v2r9jz+c9i1IfIttS/OkmLfrk1jne5hs=
+github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
 github.com/segmentio/encoding v0.2.17 h1:cgfmPc44u1po1lz5bSgF00gLCROBjDNc7h+H7I20zpc=
 github.com/segmentio/encoding v0.2.17/go.mod h1:7E68jTSWMnNoYhHi1JbLd7NBSB6XfE4vzqhR88hDBQc=
 github.com/tidwall/gjson v1.7.5 h1:zmAN/xmX7OtpAkv4Ovfso60r/BiCi5IErCDYGNJu+uc=
@@ -20,3 +24,8 @@ github.com/tidwall/pretty v1.1.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhV
 golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
+mvdan.cc/xurls/v2 v2.2.0 h1:NSZPykBXJFCetGZykLAxaL6SIpvbVy/UFEniIfHAa8A=
+mvdan.cc/xurls/v2 v2.2.0/go.mod h1:EV1RMtya9D6G5DMYPGD8zTQzaHet6Jh8gFlRgGRJeO8=
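For reference, the new mvdan.cc/xurls/v2 dependency (added to go.mod and go.sum above) provides the relaxed URL matcher that urlFilter uses. A minimal standalone sketch of that behaviour, assuming, as URLList's "cut -f 3" suggests, that the URL sits in the third tab-separated column (the example line is made up):

package main

import (
	"fmt"
	"strings"

	"mvdan.cc/xurls/v2"
)

func main() {
	// xurls.Relaxed returns a *regexp.Regexp that also matches URLs
	// written without an explicit scheme, e.g. "example.org/x".
	rx := xurls.Relaxed()
	line := "id1\trelease1\tsee http://example.org/paper.pdf for details"
	parts := strings.Split(line, "\t")
	// Mirrors what URLList does: cut -f 3, then skate-cleanup -c url -f 1,
	// i.e. keep only the first URL found in that column.
	fmt.Println(rx.FindString(parts[2])) // http://example.org/paper.pdf
}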