diff options
Diffstat (limited to 'skate/url.go')
-rw-r--r-- | skate/url.go | 45 |
1 files changed, 45 insertions, 0 deletions
// Contents of skate/url.go (package skate). The patterns below require the
// standard library "regexp" package to be imported.

var (
	// patNonWordDomain captures, as group 2, any run of characters directly
	// after the scheme that are not letters, digits or underscores. Unicode
	// classes are used instead of \w (which is ASCII-only in Go) so that
	// internationalized host names are not mistaken for junk.
	patNonWordDomain = regexp.MustCompile(`(https?://)([^\p{L}\p{N}_]*)(.*)`)

	// patRepeatedHttpSlashes captures extra slashes following the scheme.
	// patNonWordDomain already removes these (a slash is not a word
	// character); the pattern is kept as an explicit, separate rule.
	patRepeatedHttpSlashes = regexp.MustCompile(`(https?://)(/)*(.*)`)

	// patHttpDOI matches a bare DOI ("10.<1-8 digits>/...") written where the
	// host name should be.
	patHttpDOI = regexp.MustCompile(`(https?://)(10[.][0-9]{1,8}/.*)`)

	// patAccessedOn matches an "accessed on" style note (several spellings,
	// case-insensitive) glued to the end of a URL, including an optional dot
	// in front of it. The U flag makes the quantifiers non-greedy so group 1
	// ends before the first keyword and before that optional dot.
	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.]?(accessedon|consultado|diaksestanggal|diaksespadatanggal|acesso|accessoem|accessed).*$`)

	// patFileExtraSuffix matches text trailing a known file extension. Go
	// regular expressions prefer the leftmost alternative, so "docx" has to
	// be listed before "doc"; otherwise ".docx" is cut down to ".doc".
	//
	// NOTE(review): the extension is not restricted to the URL path, so a
	// host such as "www.doctor.com" still matches ".doc" - confirm whether
	// this rule should be anchored behind the first path slash.
	patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|docx|doc|rar))(.*)$`)
)

// SanitizeURL applies various cleanup rules on URLs as found in references.
// The rules run in a fixed order, each on the output of the previous one, and
// a string that matches none of them is returned unchanged.
func SanitizeURL(s string) string {
	// Junk between scheme and host:
	//
	// http://!!!:
	// http://!
	// http://"
	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)

	// http:///en.m.wikipedia.org/ChenLong
	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)

	// A DOI in place of the host is rewritten to a doi.org link:
	//
	// http://10.1113/jphysiol.2002.026047
	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)

	// Not handled yet: a bare DOI URL directly followed by a second URL, e.g.
	// http://10.3386/w20634https://doi.org/10.3386/w20634

	// Trailing access notes. ReplaceAllString returns its input unchanged
	// when nothing matches, so no separate MatchString check is needed.
	//
	// .diaksestanggal27-03-2017.10.30Wib
	// accessedon15
	// .Accessed
	// Acessoem:10/09/2012
	// .Acesso:11Abr
	s = patAccessedOn.ReplaceAllString(s, `$1`)

	// Text glued to the end of a file name:
	//
	// http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience
	s = patFileExtraSuffix.ReplaceAllString(s, `$1`)

	return s
}