1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
package skate
import (
"fmt"
"log"
"strings"
"github.com/segmentio/encoding/json"
"mvdan.cc/xurls/v2"
)
// TODO: This should be revisited entirely.
//
// rxRelaxed is the matcher returned by xurls.Relaxed(). It is created once,
// when the package is initialized, and FilterURL.Run calls FindString on it
// to pull the first URL-like substring out of a field.
var rxRelaxed = xurls.Relaxed()
// FilterURL is a line oriented URL filter. Its Run method splits a line into
// fields, extracts a URL from one of them and writes the URL back in place of
// the original field value.
type FilterURL struct {
	// Delimiter is the separator Run uses to split a line into fields and
	// to join the fields again afterwards.
	Delimiter string
	// Index is the 1-based position of the field Run operates on.
	Index int
	// BestEffort makes Run log lines that have fewer than Index fields and
	// return (nil, nil) for them, instead of returning an error.
	BestEffort bool
	// Aggressive makes Run pass the extracted URL through SanitizeURL.
	Aggressive bool
	// SkipNonMatches makes Run return (nil, nil) for lines in which no URL
	// was found in the selected field.
	SkipNonMatches bool
	// AllowedSchemas, when non-empty, makes Run return (nil, nil) for lines
	// whose URL does not pass the HasAnyPrefix check against this list.
	AllowedSchemas []string
}
// Run executes the filter on a blob of data, most likely a line.
//
// The input is split on f.Delimiter and the field at the 1-based position
// f.Index is replaced by the first match of rxRelaxed in that field. The
// fields are then joined with f.Delimiter again and returned.
//
// Run returns (nil, nil) to signal that the line should be dropped. This
// happens when
//
//   - the line has fewer than f.Index fields and f.BestEffort is set (the
//     line is logged in that case),
//   - no URL was found and f.SkipNonMatches is set,
//   - f.AllowedSchemas is non-empty and HasAnyPrefix(url, f.AllowedSchemas)
//     is false.
//
// An error is returned when f.Index is smaller than one, or when the line has
// too few fields and f.BestEffort is not set.
func (f *FilterURL) Run(p []byte) ([]byte, error) {
	// Fields are addressed 1-based; anything below one would index before
	// the start of the slice further down.
	if f.Index < 1 {
		return nil, fmt.Errorf("invalid field index %d: must be 1 or greater", f.Index)
	}
	parts := strings.Split(string(p), f.Delimiter)
	if len(parts) < f.Index {
		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
		if f.BestEffort {
			log.Println(msg)
			return nil, nil
		}
		// msg embeds the raw input line, which may contain '%' (e.g.
		// percent-encoded URLs), so it must not be used as a format string.
		return nil, fmt.Errorf("%s", msg)
	}
	field := f.Index - 1
	url := rxRelaxed.FindString(parts[field])
	if f.Aggressive {
		url = SanitizeURL(url)
	}
	if url == "" && f.SkipNonMatches {
		return nil, nil
	}
	if len(f.AllowedSchemas) > 0 && !HasAnyPrefix(url, f.AllowedSchemas) {
		return nil, nil
	}
	// When the selected field is the last one on the line, terminate the
	// output with a newline; presumably the extracted URL no longer carries
	// the line terminator of the input.
	if f.Index == len(parts) {
		url += "\n"
	}
	parts[field] = url
	return []byte(strings.Join(parts, f.Delimiter)), nil
}
// FilterDOI is a line oriented DOI filter. Its Run method splits a line into
// fields, extracts a DOI from one of them and writes the lowercased DOI back
// in place of the original field value.
type FilterDOI struct {
	// Delimiter is the separator Run uses to split a line into fields and
	// to join the fields again afterwards.
	Delimiter string
	// Index is the 1-based position of the field Run operates on.
	Index int
	// BestEffort makes Run log lines that have fewer than Index fields and
	// return (nil, nil) for them, instead of returning an error.
	BestEffort bool
	// Aggressive is not read by Run.
	// NOTE(review): presumably kept for parity with FilterURL - confirm.
	Aggressive bool
	// SkipNonMatches makes Run return (nil, nil) for lines in which no DOI
	// was found in the selected field.
	SkipNonMatches bool
	// AllowedSchema is not read by Run.
	// NOTE(review): FilterURL spells its counterpart AllowedSchemas.
	AllowedSchema []string
}
// Run executes the filter on a blob of data, most likely a line.
//
// The input is split on f.Delimiter and the field at the 1-based position
// f.Index is replaced by the lowercased first match of PatDOI in that field
// (an empty string if nothing matched). The fields are then joined with
// f.Delimiter again and returned.
//
// Run returns (nil, nil) to signal that the line should be dropped. This
// happens when the line has fewer than f.Index fields and f.BestEffort is set
// (the line is logged in that case), or when no DOI was found and
// f.SkipNonMatches is set.
//
// An error is returned when f.Index is smaller than one, or when the line has
// too few fields and f.BestEffort is not set.
func (f *FilterDOI) Run(p []byte) ([]byte, error) {
	// Fields are addressed 1-based; anything below one would index before
	// the start of the slice further down.
	if f.Index < 1 {
		return nil, fmt.Errorf("invalid field index %d: must be 1 or greater", f.Index)
	}
	parts := strings.Split(string(p), f.Delimiter)
	if len(parts) < f.Index {
		msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
		if f.BestEffort {
			log.Println(msg)
			return nil, nil
		}
		// msg embeds the raw input line, which may contain '%', so it must
		// not be used as a format string.
		return nil, fmt.Errorf("%s", msg)
	}
	field := f.Index - 1
	doi := PatDOI.FindString(parts[field])
	if doi == "" && f.SkipNonMatches {
		return nil, nil
	}
	// NOTE(review): unlike FilterURL.Run, no newline is re-appended when the
	// selected field is the last one on the line - confirm that callers
	// terminate the output themselves.
	parts[field] = strings.ToLower(doi)
	return []byte(strings.Join(parts, f.Delimiter)), nil
}
// FilterRawRef is an ad-hoc filter without any configuration. Its Run method
// drops JSON-encoded references whose unstructured biblio text contains a
// long run of dots and passes every other reference through unchanged.
type FilterRawRef struct{}
// Run executes the filter: p is decoded as a JSON-encoded Ref and inspected.
// If decoding fails, the decoding error is returned. If the unstructured
// biblio text contains a long run of dots, (nil, nil) is returned so the
// record is dropped; otherwise p is handed back unmodified.
//
// TODO: Gather cleanup functions together and make them more easily shared.
func (f *FilterRawRef) Run(p []byte) ([]byte, error) {
	// Marker that disqualifies a reference.
	// NOTE(review): presumably leader dots from a table of contents - confirm.
	const dotRun = "................"

	var doc Ref
	err := json.Unmarshal(p, &doc)
	switch {
	case err != nil:
		return nil, err
	case strings.Contains(doc.Biblio.Unstructured, dotRun):
		return nil, nil
	default:
		return p, nil
	}
}
|