blob: 8f6049e05877f818cbf4e14f0a8e1cbb8f6186c5 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
package skate
import (
"strings"
)
// SanitizeDOI normalizes a raw DOI-like string and applies a handful of cheap
// plausibility checks. It is a rough port of the simple Python 'clean_doi()'
// helper.
//
// The input is stripped of surrounding whitespace and lower-cased (for more
// permissive matching), and everything in front of the first "10." is
// discarded, which covers DOI URLs and textual prefixes as well as some other
// mangling. Specific OCR or ML parsing errors, such as commonly mangled
// suffixes, are not (yet) repaired.
//
// Lower-casing is done with strings.ToLower, so non-ASCII letters are
// case-folded too; apart from that, non-ASCII content is neither validated
// nor rewritten.
//
// The empty string is returned when the input is definitely not a DOI: it
// has no "10.", it is shorter than 8 bytes before or after normalization, or
// the normalized value has no "/". Otherwise the function is intentionally
// permissive and performs little validation. It runs no regular expressions,
// so it is cheap enough to be called liberally.
func SanitizeDOI(raw string) string {
	// Fast rejection before any normalization work.
	if !strings.Contains(raw, "10.") || len(raw) < 8 {
		return ""
	}

	doi := strings.TrimSpace(raw)
	doi = strings.ToLower(doi)

	// Keep only the part beginning at the first "10."; anything ahead of it
	// is treated as a prefix (URL, "doi:" tag, ...) and dropped.
	switch idx := strings.Index(doi, "10."); {
	case idx < 0:
		return ""
	case idx > 0:
		doi = doi[idx:]
	}

	// A plausible DOI still needs a minimum length and a prefix/suffix
	// separator after the prefix has been stripped.
	if strings.Contains(doi, "/") && len(doi) >= 8 {
		return doi
	}
	return ""
}
|