aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cleanup.go
blob: 4a10063c7195221f7d9eddf3e633c2e2d01a47a2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package skate

import (
	"fmt"
	"log"
	"strings"

	"github.com/segmentio/encoding/json"
	"mvdan.cc/xurls/v2"
)

// TODO: This should be revisited entirely.

// rxRelaxed is the matcher returned by xurls.Relaxed. It is built once at
// package initialization and used by FilterURL.Run to pull the first URL out
// of a field.
var rxRelaxed = xurls.Relaxed()

// FilterURL is a line oriented URL filter. Its Run method splits a line into
// fields and replaces one of them with the URL found inside it.
//
// Fields:
//
//   - Delimiter: separator used to split the line into fields and to join
//     them again afterwards.
//   - Index: 1-based position of the field that holds the URL.
//   - BestEffort: if set, a line with fewer than Index fields is logged and
//     dropped instead of being reported as an error.
//   - Aggressive: if set, the extracted URL is additionally passed through
//     SanitizeURL.
//   - SkipNonMatches: if set, a line in which no URL was found is dropped.
//   - AllowedSchemas: if non-empty, a line is dropped unless HasAnyPrefix
//     accepts its URL for this list (presumably a prefix test such as
//     "http://" - confirm against HasAnyPrefix).
type FilterURL struct {
	Delimiter      string
	Index          int
	BestEffort     bool
	Aggressive     bool
	SkipNonMatches bool
	AllowedSchemas []string
}

// Run executes the filter on a blob of data, most likely a line. The input is
// split on f.Delimiter and the field at the 1-based position f.Index is
// replaced by the first URL found in it (sanitized as well if f.Aggressive is
// set); the remaining fields are passed through unchanged and the result is
// joined with f.Delimiter again.
//
// A nil slice together with a nil error means the line was dropped. That
// happens when the line has too few fields and f.BestEffort is set, when no
// URL was found and f.SkipNonMatches is set, or when f.AllowedSchemas is
// non-empty and the URL is not accepted by HasAnyPrefix.
//
// An error is returned if f.Index is less than one, or if the line has fewer
// than f.Index fields and f.BestEffort is not set.
func (f *FilterURL) Run(p []byte) ([]byte, error) {
	if f.Index < 1 {
		// Index is 1-based; a smaller value would index out of range below.
		return nil, fmt.Errorf("invalid field index %d, must be at least 1", f.Index)
	}
	parts := strings.Split(string(p), f.Delimiter)
	if len(parts) < f.Index {
		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
		if f.BestEffort {
			log.Println(msg)
			return nil, nil
		}
		// msg embeds raw input, so it must never be used as a format string.
		return nil, fmt.Errorf("%s", msg)
	}
	link := rxRelaxed.FindString(parts[f.Index-1])
	if f.Aggressive {
		link = SanitizeURL(link)
	}
	if link == "" && f.SkipNonMatches {
		return nil, nil
	}
	if len(f.AllowedSchemas) > 0 && !HasAnyPrefix(link, f.AllowedSchemas) {
		return nil, nil
	}
	// When the URL is the last field, terminate the output with a newline.
	// With Index >= 1 and len(parts) >= Index this also covers the
	// single-field case.
	if f.Index == len(parts) {
		link += "\n"
	}
	parts[f.Index-1] = link
	return []byte(strings.Join(parts, f.Delimiter)), nil
}

// FilterDOI is a line oriented DOI filter. Its Run method splits a line into
// fields and replaces one of them with the lowercased DOI found inside it.
//
// Fields:
//
//   - Delimiter: separator used to split the line into fields and to join
//     them again afterwards.
//   - Index: 1-based position of the field that holds the DOI.
//   - BestEffort: if set, a line with fewer than Index fields is logged and
//     dropped instead of being reported as an error.
//   - SkipNonMatches: if set, a line in which no DOI was found is dropped.
//   - Aggressive, AllowedSchema: not read by Run as written in this file.
type FilterDOI struct {
	Delimiter      string
	Index          int
	BestEffort     bool
	Aggressive     bool
	SkipNonMatches bool
	AllowedSchema  []string
}

// Run executes the filter on a blob of data, most likely a line. The input is
// split on f.Delimiter and the field at the 1-based position f.Index is
// replaced by the lowercased first match of PatDOI in that field; the
// remaining fields are passed through unchanged and the result is joined with
// f.Delimiter again.
//
// A nil slice together with a nil error means the line was dropped. That
// happens when the line has too few fields and f.BestEffort is set, or when
// no DOI was found and f.SkipNonMatches is set. Without f.SkipNonMatches a
// non-matching field is replaced by the empty string.
//
// An error is returned if f.Index is less than one, or if the line has fewer
// than f.Index fields and f.BestEffort is not set.
func (f *FilterDOI) Run(p []byte) ([]byte, error) {
	if f.Index < 1 {
		// Index is 1-based; a smaller value would index out of range below.
		return nil, fmt.Errorf("invalid field index %d, must be at least 1", f.Index)
	}
	parts := strings.Split(string(p), f.Delimiter)
	if len(parts) < f.Index {
		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
		if f.BestEffort {
			log.Println(msg)
			return nil, nil
		}
		// msg embeds raw input, so it must never be used as a format string.
		return nil, fmt.Errorf("%s", msg)
	}
	doi := PatDOI.FindString(parts[f.Index-1])
	if doi == "" && f.SkipNonMatches {
		return nil, nil
	}
	parts[f.Index-1] = strings.ToLower(doi)
	return []byte(strings.Join(parts, f.Delimiter)), nil
}

// FilterRawRef is an ad-hoc filter. It carries no configuration; its Run
// method decodes a JSON document into a Ref and drops documents whose
// Biblio.Unstructured text contains a long run of consecutive dots.
type FilterRawRef struct{}

// Run executes the filter. TODO: Gather cleanup functions together and make
// them more easily shared.
func (f *FilterRawRef) Run(p []byte) ([]byte, error) {
	var ref Ref
	if err := json.Unmarshal(p, &ref); err != nil {
		return nil, err
	}
	if strings.Contains(ref.Biblio.Unstructured, "................") {
		return nil, nil
	}
	return p, nil
}