diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-25 16:29:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-25 16:36:10 -0700 |
commit | 9df1ff863eb9729faa9b46effb460c74203969f7 (patch) | |
tree | 03c94403748bd0c54735e011a058e2aba250c3f5 /skate | |
parent | 1c1c2cb5eb983ae26a8a445aee081b147cf0f652 (diff) | |
download | refcat-9df1ff863eb9729faa9b46effb460c74203969f7.tar.gz refcat-9df1ff863eb9729faa9b46effb460c74203969f7.zip |
skate: fast SanitizeDOI helper for normalizing DOIs
Diffstat (limited to 'skate')
-rw-r--r-- | skate/doi.go | 39 | ||||
-rw-r--r-- | skate/doi_test.go | 32 |
2 files changed, 71 insertions, 0 deletions
// Origin: skate/doi.go (new file in this commit); needs the standard
// "strings" import.

// SanitizeDOI normalizes and loosely verifies a raw DOI string. It is roughly
// a re-implementation of the simple 'clean_doi()' python function.
//
// The following steps are applied, in order:
//
//   - surrounding whitespace is trimmed;
//   - everything before the first "10." is dropped, which strips URL prefixes
//     ("https://doi.org/", "dx.doi.org/"), "doi:" prefixes and other leading
//     junk;
//   - the remainder is lower-cased, for more permissive matching;
//   - a doubled slash directly after the registrant prefix, as in
//     "10.1037//0002-9432.72.1.50", is collapsed into a single slash.
//
// It does not validate or convert non-ASCII characters, and does not (yet)
// repair other OCR or ML parsing errors such as mangled suffixes.
//
// It is intended to be cheap enough to call liberally: no regexes are
// executed, and inputs without a "10." marker are rejected before any
// allocation happens.
//
// It returns the empty string if the input is definitely not a DOI, that is,
// when there is no "10." marker, no "/" after it, or fewer than 8 bytes
// remain. Otherwise validation is deliberately permissive.
func SanitizeDOI(raw string) string {
	raw = strings.TrimSpace(raw)

	// "10." consists only of ASCII digits and punctuation, so lower-casing can
	// neither create nor destroy it. Searching first lets non-DOI input bail
	// out without paying for the lower-cased copy.
	start := strings.Index(raw, "10.")
	if start == -1 {
		return ""
	}
	raw = strings.ToLower(raw[start:])

	// A DOI needs a slash between the registrant prefix and the suffix.
	slash := strings.IndexByte(raw, '/')
	if slash == -1 {
		return ""
	}
	// Collapse "10.1037//x" into "10.1037/x"; only the first separator is
	// touched, later "//" sequences inside the suffix are kept as they are.
	if strings.HasPrefix(raw[slash+1:], "/") {
		raw = raw[:slash] + raw[slash+1:]
	}

	// Shorter than anything that could plausibly be a DOI.
	if len(raw) < 8 {
		return ""
	}
	return raw
}

// Origin: skate/doi_test.go (new file in this commit); needs the standard
// "testing" import.

// TestSanitizeDOI runs SanitizeDOI over a table of raw inputs and compares the
// result with the expected normalized DOI; an empty expected value means the
// input must be rejected.
func TestSanitizeDOI(t *testing.T) {
	var cases = []struct {
		in  string
		out string
	}{
		{"", ""},
		{"a", ""},
		{"???", ""},
		{"10.1234", ""},
		{"10.1234/asdf ", "10.1234/asdf"},
		{"10.1234/ASDF", "10.1234/asdf"},
		{"10.1037/0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"},
		{"http://doi.org/10.1234/asdf ", "10.1234/asdf"},
		{"http://doi.org/10.123", ""},
		{"dx.doi.org/10.1234/asdf ", "10.1234/asdf"},
		{"21924DOI10.1234/asdf ", "10.1234/asdf"},
		{"https://dx.doi.org/10.1234/asdf ", "10.1234/asdf"},
		{"doi:10.1234/asdf ", "10.1234/asdf"},
		{"10.7326/M20-6817", "10.7326/m20-6817"},
		{"10.1037//0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"},
	}
	for _, c := range cases {
		// Errorf rather than Fatalf, so one run reports every failing case,
		// and the message names the input that failed.
		if out := SanitizeDOI(c.in); out != c.out {
			t.Errorf("SanitizeDOI(%q) = %q, want %q", c.in, out, c.out)
		}
	}
}