aboutsummaryrefslogtreecommitdiffstats
path: root/skate/url.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/url.go')
-rw-r--r--skate/url.go45
1 files changed, 45 insertions, 0 deletions
diff --git a/skate/url.go b/skate/url.go
new file mode 100644
index 0000000..ed36b73
--- /dev/null
+++ b/skate/url.go
@@ -0,0 +1,45 @@
+package skate
+
+import (
+ "regexp"
+)
+
var (
	// patNonWordDomain captures junk sitting between the scheme and the
	// first letter, digit or underscore of the host, e.g. `http://!!!:`.
	// Unicode classes are used instead of `\w`, which is ASCII-only in Go
	// and would eat the leading letters of hosts such as `über.example`.
	patNonWordDomain = regexp.MustCompile(`(https?://)([^\pL\pN_]*)(.*)`)

	// patRepeatedHttpSlashes captures surplus slashes directly after the
	// scheme, e.g. `http:///host`.
	patRepeatedHttpSlashes = regexp.MustCompile(`(https?://)(/)*(.*)`)

	// patHttpDOI captures a bare DOI (`10.<digits>/<suffix>`) used in place
	// of a host name.
	patHttpDOI = regexp.MustCompile(`(https?://)(10[.][0-9]{1,8}/.*)`)

	// patAccessedOn captures a trailing "accessed on ..." note glued to the
	// URL. The U flag makes the leading group lazy, so the cut happens at
	// the first keyword and an optional dot right before it is dropped too.
	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.]?(accessedon|consultado|diaksestanggal|diaksespadatanggal|acesso|accessoem|accessed).*$`)

	// patFileExtraSuffix captures text glued to a known file extension.
	//
	//   - The extension has to appear in the path, after the host, so that
	//     hosts like `www.zipcar.com` are left alone.
	//   - `docx` is listed before `doc`; alternation prefers the first
	//     branch that matches, and `doc` first would cut `.docx` to `.doc`.
	//   - The glued suffix must not contain `?` or `#`, so query strings
	//     and fragments following the extension are preserved.
	patFileExtraSuffix = regexp.MustCompile(`(https?://[^/]+/.*[.](docx|doc|html|pdf|rar|zip))([^?#]*)$`)
)

// SanitizeURL applies various cleanup rules on URLs as found in references
// and returns the cleaned up string. Input that matches none of the rules is
// returned unchanged. The rules run in a fixed order:
//
//  1. junk between scheme and host is dropped (`http://!!!:`, `http://"`)
//  2. surplus slashes after the scheme are dropped
//     (`http:///en.m.wikipedia.org/ChenLong`)
//  3. a bare DOI used as host is rewritten to a doi.org link
//     (`http://10.1113/jphysiol.2002.026047`)
//  4. a trailing access note is cut off (`.Accessed`, `accessedon15`,
//     `Acessoem:10/09/2012`, `.Acesso:11Abr`,
//     `.diaksestanggal27-03-2017.10.30Wib`)
//  5. text glued to a file extension is cut off
//     (`.../Timing-MSChart.zipJournalofInsectScience`)
//
// Known unhandled case: two URLs glued together, such as
// `http://10.3386/w20634https://doi.org/10.3386/w20634`.
func SanitizeURL(s string) string {
	// ReplaceAllString returns its input unchanged when nothing matches, so
	// no separate MatchString check is required for any of the rules.
	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)
	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)
	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
	s = patAccessedOn.ReplaceAllString(s, `$1`)
	s = patFileExtraSuffix.ReplaceAllString(s, `$1`)
	return s
}