diff options
Diffstat (limited to 'skate/doi.go')
-rw-r--r-- | skate/doi.go | 39 |
1 files changed, 39 insertions, 0 deletions
// SanitizeDOI normalizes a raw DOI string and runs a handful of cheap sanity
// checks on it. It is roughly a port of the simple Python 'clean_doi()' helper.
//
// The input is trimmed of surrounding whitespace and lower-cased (for more
// permissive matching), and anything preceding the first "10." is discarded,
// which covers DOI URLs and "doi:"-style prefixes. Specific OCR or ML parsing
// errors (eg, commonly mangled suffixes) are not (yet) repaired, and non-ASCII
// characters are neither validated nor converted.
//
// The function is meant to be cheap enough to call liberally; it uses plain
// string operations only and never runs a regex.
//
// It returns the empty string when the input is definitely not a DOI: shorter
// than 8 bytes, missing "10.", or missing "/" once the prefix has been
// removed. Beyond that it is deliberately permissive and validates very
// little.
func SanitizeDOI(raw string) string {
	const marker = "10."

	// Reject obvious non-DOIs before doing any normalization work.
	if len(raw) < 8 {
		return ""
	}
	if !strings.Contains(raw, marker) {
		return ""
	}

	// Normalize: surrounding whitespace goes first, then case.
	cleaned := strings.TrimSpace(raw)
	cleaned = strings.ToLower(cleaned)

	// Keep everything from the first "10." onwards; whatever precedes it
	// (URL scheme and host, "doi:" and similar prefixes) is dropped.
	pos := strings.Index(cleaned, marker)
	if pos < 0 {
		return ""
	}
	cleaned = cleaned[pos:]

	// What remains must still be long enough and contain a slash.
	if len(cleaned) >= 8 && strings.Contains(cleaned, "/") {
		return cleaned
	}
	return ""
}