// SanitizeDOI normalizes and loosely verifies a raw DOI string. It is roughly
// a re-implementation of the simple 'clean_doi()' python function.
//
// Anything preceding the first "10." (URL schemes and hosts, "doi:" style
// prefixes, stray text) is discarded, surrounding whitespace is trimmed, and
// the result is lower-cased for more permissive matching. It does not (yet)
// handle specific OCR or ML parsing errors (eg, common mangled suffixes), and
// it does not validate or convert non-ASCII characters.
//
// Intended to be performant and used liberally; does not execute any regexes.
//
// Returns the empty string if the input is definitely not a DOI: shorter than
// 8 bytes, no "10." marker, or no "/" after the marker. It is otherwise
// relatively permissive and does little validation.
func SanitizeDOI(raw string) string {
	// Cheap rejection before any allocation.
	if len(raw) < 8 {
		return ""
	}
	// A single scan both detects the "10." marker and locates the prefix to
	// drop. Searching the original (not yet lower-cased) input is safe: the
	// marker is pure ASCII, so case mapping can neither create nor remove it,
	// and the cut always lands on a rune boundary.
	start := strings.Index(raw, "10.")
	if start == -1 {
		return ""
	}
	// Only the part we keep is trimmed and lower-cased.
	doi := strings.ToLower(strings.TrimSpace(raw[start:]))
	// Final simple checks: minimum plausible length and a prefix/suffix
	// separator.
	if len(doi) < 8 || !strings.Contains(doi, "/") {
		return ""
	}
	return doi
}