author     Martin Czygan <martin.czygan@gmail.com>    2021-05-11 19:50:53 +0200
committer  Martin Czygan <martin.czygan@gmail.com>    2021-05-11 19:50:53 +0200
commit     da83d4584c8f131cb3ad80ed2928a9fd033af5f9 (patch)
tree       a3935087dd8815ca953ea2fd24740a1e1f4a7670 /skate/cmd/skate-cleanup
parent     4016f2b50bf7b22eeb9eb41cf83d07bf59e8d7b3 (diff)
rename: skate-to-doi to skate-cleanup
Diffstat (limited to 'skate/cmd/skate-cleanup')
-rw-r--r--  skate/cmd/skate-cleanup/main.go | 93
1 file changed, 93 insertions, 0 deletions
diff --git a/skate/cmd/skate-cleanup/main.go b/skate/cmd/skate-cleanup/main.go
new file mode 100644
index 0000000..e1c7f3c
--- /dev/null
+++ b/skate/cmd/skate-cleanup/main.go
@@ -0,0 +1,93 @@
+// Filter to parse a correct-looking DOI, URL, etc. out of a field.
+//
+// $ echo "1,xxx 10.123/12312 xxx,3" | skate-cleanup -c doi -d , -f 2
+// 1,10.123/12312,3
+//
+// We can use this to sanitize fields in the reference dataset.
+
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"regexp"
+	"runtime"
+	"strings"
+
+	"git.archive.org/martin/cgraph/skate/parallel"
+	"mvdan.cc/xurls/v2"
+)
+
+var (
+	numWorkers     = flag.Int("w", runtime.NumCPU(), "number of workers")
+	batchSize      = flag.Int("b", 100000, "batch size")
+	delimiter      = flag.String("d", "\t", "delimiter")
+	index          = flag.Int("f", 1, "field to clean up (e.g. a DOI), 1-indexed")
+	bestEffort     = flag.Bool("B", false, "only log errors, but do not stop")
+	skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches")
+	what           = flag.String("c", "doi", "what to clean: doi, url")
+
+	PatDOI    = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+	rxRelaxed = xurls.Relaxed()
+)
+
+func main() {
+	flag.Parse()
+	var f func([]byte) ([]byte, error)
+	switch *what {
+	case "doi":
+		f = doiFilter
+	case "url":
+		f = urlFilter
+	default:
+		f = doiFilter // fall back to DOI cleanup
+	}
+	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
+	pp.NumWorkers = *numWorkers
+	pp.BatchSize = *batchSize
+	if err := pp.Run(); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// urlFilter finds the first URL in the configured field.
+func urlFilter(p []byte) ([]byte, error) {
+	parts := strings.Split(string(p), *delimiter)
+	if len(parts) < *index {
+		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
+		if *bestEffort {
+			log.Println(msg)
+			return nil, nil
+		} else {
+			return nil, fmt.Errorf("%s", msg)
+		}
+	}
+	url := rxRelaxed.FindString(parts[*index-1])
+	if url == "" && *skipNonMatches {
+		return nil, nil
+	}
+	parts[*index-1] = url
+	return []byte(strings.Join(parts, *delimiter)), nil
+}
+
+// doiFilter finds the first DOI in the configured field.
+func doiFilter(p []byte) ([]byte, error) {
+	parts := strings.Split(string(p), *delimiter)
+	if len(parts) < *index {
+		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
+		if *bestEffort {
+			log.Println(msg)
+			return nil, nil
+		} else {
+			return nil, fmt.Errorf("%s", msg)
+		}
+	}
+	result := PatDOI.FindString(parts[*index-1])
+	if result == "" && *skipNonMatches {
+		return nil, nil
+	}
+	parts[*index-1] = result
+	return []byte(strings.Join(parts, *delimiter)), nil
+}
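
For illustration (not part of the commit), here is a minimal standalone sketch of the per-line cleanup step this tool performs, using the same DOI pattern as PatDOI above and the example line from the file comment:

// Sketch of the per-line DOI cleanup step (not part of the commit):
// extract a DOI from field 2 of a comma-delimited line.
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// Same pattern as PatDOI in main.go above.
var patDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)

func main() {
	line := "1,xxx 10.123/12312 xxx,3"
	parts := strings.Split(line, ",")
	parts[1] = patDOI.FindString(parts[1]) // field 2, 1-indexed
	fmt.Println(strings.Join(parts, ","))
	// Output: 1,10.123/12312,3
}

Run against the example, this prints the line with field 2 reduced to the bare DOI; the real tool applies the same transformation per line over stdin, in parallel batches.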