aboutsummaryrefslogtreecommitdiffstats
path: root/skate/doi.go
blob: 8f6049e05877f818cbf4e14f0a8e1cbb8f6186c5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
package skate

import (
	"strings"
)

// SanitizeDOI normalizes a raw DOI-like string and applies a few cheap
// plausibility checks. It is modeled on the simple Python 'clean_doi()'
// helper.
//
// Normalization: surrounding whitespace is trimmed, the value is lower-cased
// (for more permissive matching), and everything preceding the first "10."
// is dropped, so DOI URLs and "doi:"-style prefixes reduce to the bare
// identifier.
//
// Verification is deliberately minimal: the result must be at least 8 bytes
// long and contain a "/". Non-ASCII characters are neither validated nor
// converted, and specific OCR or ML parsing errors (eg, commonly mangled
// suffixes) are not (yet) repaired.
//
// No regexes are executed, so the function is cheap enough to be used
// liberally. It returns the empty string when the input is definitely not a
// DOI.
func SanitizeDOI(raw string) string {
	const (
		marker = "10." // every DOI begins with this directory indicator
		minLen = 8     // anything shorter is rejected outright
	)

	// Cheap rejection before doing any allocation-prone work.
	if len(raw) < minLen || !strings.Contains(raw, marker) {
		return ""
	}

	trimmed := strings.TrimSpace(raw)
	lowered := strings.ToLower(trimmed)

	// Drop whatever precedes the first marker (URL, "doi:" prefix, ...).
	idx := strings.Index(lowered, marker)
	if idx < 0 {
		return ""
	}
	doi := lowered[idx:]

	// Re-check the length (trimming and prefix removal may have shortened
	// the value) and require a prefix/suffix separator.
	switch {
	case len(doi) < minLen:
		return ""
	case !strings.Contains(doi, "/"):
		return ""
	}
	return doi
}