aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-07-25 16:29:22 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-07-25 16:36:10 -0700
commit: 9df1ff863eb9729faa9b46effb460c74203969f7 (patch)
tree: 03c94403748bd0c54735e011a058e2aba250c3f5 /skate
parent: 1c1c2cb5eb983ae26a8a445aee081b147cf0f652 (diff)
download: refcat-9df1ff863eb9729faa9b46effb460c74203969f7.tar.gz
refcat-9df1ff863eb9729faa9b46effb460c74203969f7.zip
skate: fast SanitizeDOI helper for normalizing DOIs
Diffstat (limited to 'skate')
-rw-r--r-- skate/doi.go 39
-rw-r--r-- skate/doi_test.go 32
2 files changed, 71 insertions, 0 deletions
diff --git a/skate/doi.go b/skate/doi.go
new file mode 100644
index 0000000..8f6049e
--- /dev/null
+++ b/skate/doi.go
@@ -0,0 +1,39 @@
+package skate
+
+import (
+ "strings"
+)
+
// SanitizeDOI normalizes and loosely verifies a raw DOI string. It is roughly
// a re-implementation of the simple 'clean_doi()' python function.
//
// Everything before the first "10." is dropped, which covers DOI URLs,
// "doi:" style prefixes and some forms of mangling. The remainder is trimmed
// of surrounding whitespace and lower-cased, for more permissive matching. It
// does not (yet) handle some specific OCR or ML parsing errors (eg, common
// mangled suffixes), and it does not validate or convert non-ASCII
// characters.
//
// Intended to be performant and used liberally; does not execute any regexes.
//
// Returns the empty string if the input is definitely not a DOI: there is no
// "10." in it, fewer than 8 bytes remain once the prefix and whitespace are
// removed, or the remainder contains no "/". Otherwise it is relatively
// permissive and does little validation.
func SanitizeDOI(raw string) string {
	// Short-circuit: the result is never longer than the input, so anything
	// under 8 bytes cannot pass the final length check.
	if len(raw) < 8 {
		return ""
	}

	// Locate the "10." marker with a single scan; anything in front of it is
	// a prefix to be discarded.
	start := strings.Index(raw, "10.")
	if start < 0 {
		return ""
	}

	// Trim and lower-case only the candidate DOI rather than the whole input.
	// The marker is plain ASCII, so neither step can move or remove it and the
	// result matches normalizing first and slicing afterwards.
	doi := strings.ToLower(strings.TrimSpace(raw[start:]))

	// Final simple checks: minimum length and a prefix/suffix separator.
	if len(doi) < 8 || !strings.Contains(doi, "/") {
		return ""
	}
	return doi
}
diff --git a/skate/doi_test.go b/skate/doi_test.go
new file mode 100644
index 0000000..7a184d3
--- /dev/null
+++ b/skate/doi_test.go
@@ -0,0 +1,32 @@
+package skate
+
+import "testing"
+
+func TestSanitizeDOI(t *testing.T) {
+ var cases = []struct {
+ in string
+ out string
+ }{
+ {"", ""},
+ {"a", ""},
+ {"???", ""},
+ {"10.1234", ""},
+ {"10.1234/asdf ", "10.1234/asdf"},
+ {"10.1234/ASDF", "10.1234/asdf"},
+ {"10.1037/0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"},
+ {"http://doi.org/10.1234/asdf ", "10.1234/asdf"},
+ {"http://doi.org/10.123", ""},
+ {"dx.doi.org/10.1234/asdf ", "10.1234/asdf"},
+ {"21924DOI10.1234/asdf ", "10.1234/asdf"},
+ {"https://dx.doi.org/10.1234/asdf ", "10.1234/asdf"},
+ {"doi:10.1234/asdf ", "10.1234/asdf"},
+ {"10.7326/M20-6817", "10.7326/m20-6817"},
+ // TODO: {"10.1037//0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"},
+ }
+ for _, c := range cases {
+ out := SanitizeDOI(c.in)
+ if out != c.out {
+ t.Fatalf("got %v, want %v", out, c.out)
+ }
+ }
+}