aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-11 19:50:53 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-11 19:50:53 +0200
commitda83d4584c8f131cb3ad80ed2928a9fd033af5f9 (patch)
treea3935087dd8815ca953ea2fd24740a1e1f4a7670
parent4016f2b50bf7b22eeb9eb41cf83d07bf59e8d7b3 (diff)
downloadrefcat-da83d4584c8f131cb3ad80ed2928a9fd033af5f9.tar.gz
refcat-da83d4584c8f131cb3ad80ed2928a9fd033af5f9.zip
rename: skate-to-doi to skate-cleanup
-rw-r--r--python/refcat/tasks.py25
-rw-r--r--skate/.gitignore2
-rw-r--r--skate/Makefile2
-rw-r--r--skate/cmd/skate-cleanup/main.go93
-rw-r--r--skate/cmd/skate-to-doi/main.go58
-rw-r--r--skate/go.mod1
-rw-r--r--skate/go.sum9
7 files changed, 129 insertions, 61 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 915d406..f21fade 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -295,6 +295,29 @@ class URLTabs(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+class URLList(Refcat):
+    """
+    List of cleaned URLs from refs.
+
+    Takes the third tab-separated column of the upstream URL table and runs
+    it through ``skate-cleanup -c url``, which keeps the first URL-like match
+    per line. With ``-S`` lines without a match are dropped; with ``-B`` lines
+    with too few fields are only logged. The result is sorted and written
+    zstd-compressed.
+    """
+    def requires(self):
+        # Depend on the URL table rather than on this task itself; returning
+        # URLList() here would make the task its own prerequisite.
+        return URLTabs()
+
+    def run(self):
+        output = shellout("""
+                          zstdcat -T0 {input} |
+                          cut -f 3 |
+                          skate-cleanup -c url -B -S -f 1 |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+                          zstd -T0 -c > {output}
+                          """,
+                          n=self.n,
+                          tmpdir=self.tmpdir,
+                          input=self.input().path)
+        luigi.LocalTarget(output).move(self.output().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
class RefsDOI(Refcat):
"""
Sorted (doi, doc) tuples from refs. 225m48.755s
@@ -306,7 +329,7 @@ class RefsDOI(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ff -x biblio.doi -skip-on-empty 1 |
- skate-to-doi -f 1 |
+ skate-cleanup -c doi -f 1 |
LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
zstd -T0 -c > {output}
""",
diff --git a/skate/.gitignore b/skate/.gitignore
index 2dc8d4c..219bb28 100644
--- a/skate/.gitignore
+++ b/skate/.gitignore
@@ -15,13 +15,13 @@
# vendor/
#
/skate-bref-id
+/skate-cleanup
/skate-cluster
/skate-conv
/skate-dot
/skate-from-unstructured
/skate-map
/skate-reduce
-/skate-to-doi
/skate-wikipedia-doi
packaging/debian/skate/usr
skate_*_amd64.deb
diff --git a/skate/Makefile b/skate/Makefile
index 97b192d..fa8bb92 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-conv skate-cluster skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce
+TARGETS := skate-conv skate-cluster skate-cleanup skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce
PKGNAME := skate
.PHONY: test
diff --git a/skate/cmd/skate-cleanup/main.go b/skate/cmd/skate-cleanup/main.go
new file mode 100644
index 0000000..e1c7f3c
--- /dev/null
+++ b/skate/cmd/skate-cleanup/main.go
@@ -0,0 +1,93 @@
+// Filter to parse out a correctly looking DOI, URL, etc from a field.
+//
+// $ echo "1,xxx 10.123/12312 xxx,3" | skate-cleanup -c doi -d , -f 2
+// 1,10.123/12312,3
+//
+// We can use this to sanitize fields in the reference dataset.
+
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "os"
+ "regexp"
+ "runtime"
+ "strings"
+
+ "git.archive.org/martin/cgraph/skate/parallel"
+ "mvdan.cc/xurls/v2"
+)
+
+var (
+ // Command line flags; values are only valid after flag.Parse has run.
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 100000, "batch size")
+ delimiter = flag.String("d", "\t", "delimiter")
+ // index is the 1-based position of the field to clean; it applies to
+ // whichever mode -c selects.
+ index = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed")
+ bestEffort = flag.Bool("B", false, "only log errors, but do not stop")
+ skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches")
+ what = flag.String("c", "doi", "what to clean: doi, url")
+
+ // PatDOI matches a DOI-like string: "10.", one to eight digits, a slash,
+ // then any run of non-space characters that ends in a word character.
+ PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+ // rxRelaxed is the URL matcher returned by xurls.Relaxed(); urlFilter uses
+ // it to find the first URL in a field.
+ rxRelaxed = xurls.Relaxed()
+)
+
+// main parses the command line, picks the filter for the requested cleanup
+// mode and hands it to a parallel processor wired to stdin and stdout.
+func main() {
+	flag.Parse()
+	// Every mode other than "url" (including the default "doi") uses the
+	// DOI filter.
+	filter := doiFilter
+	if *what == "url" {
+		filter = urlFilter
+	}
+	proc := parallel.NewProcessor(os.Stdin, os.Stdout, filter)
+	proc.NumWorkers = *numWorkers
+	proc.BatchSize = *batchSize
+	if err := proc.Run(); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// urlFilter replaces the selected field (-f, 1-indexed) of a delimited line
+// with the first URL found in that field; all other fields pass through
+// unchanged.
+//
+// A line with fewer fields than the selected index is logged and dropped
+// when -B is set, and is returned as an error otherwise. When no URL is
+// found, the line is dropped if -S is set; without -S the field is emitted
+// empty. A nil result with a nil error means "emit nothing for this line".
+func urlFilter(p []byte) ([]byte, error) {
+	if *index < 1 {
+		// Fields are 1-indexed; a smaller value would index before the
+		// first field and panic below.
+		return nil, fmt.Errorf("field index must be at least 1, got %d", *index)
+	}
+	parts := strings.Split(string(p), *delimiter)
+	if len(parts) < *index {
+		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
+		if *bestEffort {
+			log.Println(msg)
+			return nil, nil
+		}
+		// msg embeds the raw input line, so it must not be used as a format
+		// string: a literal '%' (common in URLs) would garble the error.
+		return nil, fmt.Errorf("%s", msg)
+	}
+	url := rxRelaxed.FindString(parts[*index-1])
+	if url == "" && *skipNonMatches {
+		return nil, nil
+	}
+	parts[*index-1] = url
+	return []byte(strings.Join(parts, *delimiter)), nil
+}
+
+// doiFilter replaces the selected field (-f, 1-indexed) of a delimited line
+// with the first DOI-like substring matched by PatDOI; all other fields pass
+// through unchanged.
+//
+// A line with fewer fields than the selected index is logged and dropped
+// when -B is set, and is returned as an error otherwise. When nothing
+// matches, the line is dropped if -S is set; without -S the field is emitted
+// empty. A nil result with a nil error means "emit nothing for this line".
+func doiFilter(p []byte) ([]byte, error) {
+	if *index < 1 {
+		// Fields are 1-indexed; a smaller value would index before the
+		// first field and panic below.
+		return nil, fmt.Errorf("field index must be at least 1, got %d", *index)
+	}
+	parts := strings.Split(string(p), *delimiter)
+	if len(parts) < *index {
+		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
+		if *bestEffort {
+			log.Println(msg)
+			return nil, nil
+		}
+		// msg embeds the raw input line, so it must not be used as a format
+		// string: a literal '%' in the data would garble the error.
+		return nil, fmt.Errorf("%s", msg)
+	}
+	result := PatDOI.FindString(parts[*index-1])
+	if result == "" && *skipNonMatches {
+		return nil, nil
+	}
+	parts[*index-1] = result
+	return []byte(strings.Join(parts, *delimiter)), nil
+}
diff --git a/skate/cmd/skate-to-doi/main.go b/skate/cmd/skate-to-doi/main.go
deleted file mode 100644
index 377383f..0000000
--- a/skate/cmd/skate-to-doi/main.go
+++ /dev/null
@@ -1,58 +0,0 @@
-// Filter to parse out a correctly looking DOI from a field.
-//
-// $ echo "1,xxx 10.123/12312 xxx,3" | skate-to-doi -d , -f 2
-// 1,10.123/12312,3
-//
-// We can use this to sanitize DOI-like fields in the reference dataset.
-
-package main
-
-import (
- "flag"
- "fmt"
- "log"
- "os"
- "regexp"
- "runtime"
- "strings"
-
- "git.archive.org/martin/cgraph/skate/parallel"
-)
-
-var (
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 100000, "batch size")
- delimiter = flag.String("d", "\t", "delimiter")
- index = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed")
- bestEffort = flag.Bool("B", false, "only log errors, but do not stop")
- skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches")
-
- PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
-)
-
-func main() {
- flag.Parse()
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
- parts := strings.Split(string(p), *delimiter)
- if len(parts) < *index {
- msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
- if *bestEffort {
- log.Println(msg)
- return nil, nil
- } else {
- return nil, fmt.Errorf(msg)
- }
- }
- result := PatDOI.FindString(parts[*index-1])
- if len(result) == 0 && *skipNonMatches {
- return nil, nil
- }
- parts[*index-1] = result
- return []byte(strings.Join(parts, *delimiter)), nil
- })
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
-}
diff --git a/skate/go.mod b/skate/go.mod
index 57ae586..d6b668b 100644
--- a/skate/go.mod
+++ b/skate/go.mod
@@ -11,4 +11,5 @@ require (
github.com/segmentio/encoding v0.2.17
github.com/tidwall/gjson v1.7.5
golang.org/x/text v0.3.6
+ mvdan.cc/xurls/v2 v2.2.0
)
diff --git a/skate/go.sum b/skate/go.sum
index 96d323d..902db2f 100644
--- a/skate/go.sum
+++ b/skate/go.sum
@@ -5,10 +5,14 @@ github.com/elastic/go-elasticsearch/v7 v7.12.0/go.mod h1:OJ4wdbtDNk5g503kvlHLyEr
github.com/klauspost/cpuid/v2 v2.0.5/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI=
github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
+github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/matryer/is v1.4.0 h1:sosSmIWwkYITGrxZ25ULNDeKiMNzFSr4V/eqBQP0PeE=
github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e h1:S+/ptYdZtpK/MDstwCyt+ZHdXEpz86RJZ5gyZU4txJY=
github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e/go.mod h1:uFMI8w+ref4v2r9jz+c9i1IfIttS/OkmLfrk1jne5hs=
+github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/segmentio/encoding v0.2.17 h1:cgfmPc44u1po1lz5bSgF00gLCROBjDNc7h+H7I20zpc=
github.com/segmentio/encoding v0.2.17/go.mod h1:7E68jTSWMnNoYhHi1JbLd7NBSB6XfE4vzqhR88hDBQc=
github.com/tidwall/gjson v1.7.5 h1:zmAN/xmX7OtpAkv4Ovfso60r/BiCi5IErCDYGNJu+uc=
@@ -20,3 +24,8 @@ github.com/tidwall/pretty v1.1.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhV
golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
+mvdan.cc/xurls/v2 v2.2.0 h1:NSZPykBXJFCetGZykLAxaL6SIpvbVy/UFEniIfHAa8A=
+mvdan.cc/xurls/v2 v2.2.0/go.mod h1:EV1RMtya9D6G5DMYPGD8zTQzaHet6Jh8gFlRgGRJeO8=