diff options
Diffstat (limited to 'skate/cmd/skate-cleanup')
| -rw-r--r-- | skate/cmd/skate-cleanup/main.go | 93 | 
1 files changed, 93 insertions, 0 deletions
| diff --git a/skate/cmd/skate-cleanup/main.go b/skate/cmd/skate-cleanup/main.go new file mode 100644 index 0000000..e1c7f3c --- /dev/null +++ b/skate/cmd/skate-cleanup/main.go @@ -0,0 +1,93 @@ +// Filter to parse out a correctly looking DOI, URL, etc from a field. +// +// $ echo "1,xxx 10.123/12312 xxx,3" | skate-to-doi -c doi -d , -f 2 +// 1,10.123/12312,3k +// +// We can use this to sanitize fields in the reference dataset. + +package main + +import ( +	"flag" +	"fmt" +	"log" +	"os" +	"regexp" +	"runtime" +	"strings" + +	"git.archive.org/martin/cgraph/skate/parallel" +	"mvdan.cc/xurls/v2" +) + +var ( +	numWorkers     = flag.Int("w", runtime.NumCPU(), "number of workers") +	batchSize      = flag.Int("b", 100000, "batch size") +	delimiter      = flag.String("d", "\t", "delimiter") +	index          = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed") +	bestEffort     = flag.Bool("B", false, "only log errors, but do not stop") +	skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches") +	what           = flag.String("c", "doi", "what to clean: doi, url") + +	PatDOI    = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) +	rxRelaxed = xurls.Relaxed() +) + +func main() { +	flag.Parse() +	var f func([]byte) ([]byte, error) +	switch *what { +	case "doi": +		f = doiFilter +	case "url": +		f = urlFilter +	default: +		f = doiFilter +	} +	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f) +	pp.NumWorkers = *numWorkers +	pp.BatchSize = *batchSize +	if err := pp.Run(); err != nil { +		log.Fatal(err) +	} +} + +// urlFilter parses finds the first URL. +func urlFilter(p []byte) ([]byte, error) { +	parts := strings.Split(string(p), *delimiter) +	if len(parts) < *index { +		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p)) +		if *bestEffort { +			log.Println(msg) +			return nil, nil +		} else { +			return nil, fmt.Errorf(msg) +		} +	} +	url := rxRelaxed.FindString(parts[*index-1]) +	if url == "" && *skipNonMatches { +		return nil, nil +	} +	parts[*index-1] = url +	return []byte(strings.Join(parts, *delimiter)), nil +} + +// doiFilter finds a DOI +func doiFilter(p []byte) ([]byte, error) { +	parts := strings.Split(string(p), *delimiter) +	if len(parts) < *index { +		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p)) +		if *bestEffort { +			log.Println(msg) +			return nil, nil +		} else { +			return nil, fmt.Errorf(msg) +		} +	} +	result := PatDOI.FindString(parts[*index-1]) +	if result == "" && *skipNonMatches { +		return nil, nil +	} +	parts[*index-1] = result +	return []byte(strings.Join(parts, *delimiter)), nil +} | 
