aboutsummaryrefslogtreecommitdiffstats
path: root/skate/doi.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/doi.go')
-rw-r--r--skate/doi.go39
1 files changed, 39 insertions, 0 deletions
diff --git a/skate/doi.go b/skate/doi.go
new file mode 100644
index 0000000..8f6049e
--- /dev/null
+++ b/skate/doi.go
@@ -0,0 +1,39 @@
+package skate
+
+import (
+ "strings"
+)
+
// SanitizeDOI normalizes a raw DOI string and applies a few cheap sanity
// checks. It is roughly a re-implementation of the simple 'clean_doi()' python
// function.
//
// Normalization trims surrounding whitespace, lower-cases the whole string
// (for more permissive matching) and drops everything in front of the first
// "10." occurrence, which covers DOI URLs and prefixes such as "doi:".
//
// The empty string is returned when the input is definitely not a DOI: it
// contains no "10.", the candidate starting at "10." is shorter than eight
// bytes, or that candidate contains no "/". A "/" that only occurs in the
// stripped prefix (as in "http://10.12345") does not count.
//
// Validation is deliberately permissive. Non-ASCII characters are neither
// validated nor converted, and specific OCR or ML parsing errors (eg, common
// mangled suffixes) are not (yet) repaired. No regexes are executed, so the
// function is intended to be cheap enough to be used liberally.
func SanitizeDOI(raw string) string {
	const (
		marker = "10." // every DOI starts with this directory indicator
		minLen = 8     // shortest candidate accepted, in bytes
	)

	// Reject hopeless input before doing any string work.
	if len(raw) < minLen || !strings.Contains(raw, marker) {
		return ""
	}

	raw = strings.ToLower(strings.TrimSpace(raw))

	// Strip any prefix in front of the marker. The guard cannot trigger for
	// input that passed the check above, but it keeps the slice expression
	// safe on its own.
	start := strings.Index(raw, marker)
	if start < 0 {
		return ""
	}
	raw = raw[start:]

	// Re-check the length now that whitespace and prefix are gone, and
	// require the prefix/suffix separator.
	if len(raw) < minLen || !strings.Contains(raw, "/") {
		return ""
	}
	return raw
}