// Reconstructed from commit 27046b7576d92b2cbfe21450ed2ad64ba37acf66
// ("prepare url sanitization", Martin Czygan, Wed, 12 May 2021), which adds
// skate/url.go and skate/url_test.go to package skate.

// ---- skate/url.go ----

// Patterns used by SanitizeURL. They are compiled once at package scope.
var (
	// patNonWordDomain captures any run of non-word characters that directly
	// follows the scheme, e.g. the `!!` in `http://!!abc.com`.
	patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
	// patRepeatedHttpSlashes captures extra slashes after the scheme, e.g.
	// `http:///host`.
	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
	// patHttpDOI captures a DOI (`10.` + 1-8 digits + `/` + rest) that was
	// written where the host should be, e.g. `http://10.1111/joim.12348`.
	patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
	// patAccessedOn captures a trailing "accessed on" note in several
	// languages. Flags: i = case-insensitive, U = ungreedy, so the first
	// group stops at the earliest keyword.
	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.]?(accessedon|consultado|diaksestanggal|diaksespadatanggal|acesso|accessoem|accessed).*$`)
	// patFileExtraSuffix captures text glued onto a known file extension.
	// `docx` must precede `doc`: the regexp engine prefers the leftmost
	// alternative, so with `doc` first a `.docx` URL would match `.doc` and
	// lose its final `x` to the discarded suffix group.
	patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|docx|doc|rar))(.*)$`)
)

// SanitizeURL applies various cleanup rules on URLs as found in references.
//
// The rules run in a fixed order, each one on the output of the previous one.
// A rule whose pattern does not match leaves the string untouched, because
// ReplaceAllString returns its input unchanged when there is no match. The
// cleaned string is returned; an empty input yields an empty result.
func SanitizeURL(s string) string {
	// Drop junk between scheme and host:
	// http://!!!:
	// http://!
	// http://"
	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)

	// Collapse surplus slashes after the scheme:
	// http:///en.m.wikipedia.org/ChenLong
	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)

	// Turn a DOI used as host into a doi.org link:
	// http://10.1113/jphysiol.2002.026047
	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)

	// NOTE: there is no dedicated rule yet for a DOI URL with a second URL
	// glued onto it, e.g.
	// http://10.3386/w20634https://doi.org/10.3386/w20634

	// Cut off "accessed on ..." style notes, keeping what precedes them:
	// .diaksestanggal27-03-2017.10.30Wib
	// accessedon15
	// .Accessed
	// Acessoem:10/09/2012
	// .Acesso:11Abr
	s = patAccessedOn.ReplaceAllString(s, `$1`)

	// Cut off text glued onto a file extension:
	// http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience
	s = patFileExtraSuffix.ReplaceAllString(s, `$1`)

	return s
}

// ---- skate/url_test.go ----

// TestSanitizeURL runs SanitizeURL over a table of raw inputs and expected
// cleaned outputs, and reports every case that does not match.
func TestSanitizeURL(t *testing.T) {
	var cases = []struct {
		in  string
		out string
	}{
		{"", ""},
		{"http://abc.com", "http://abc.com"},
		{"http://!!abc.com", "http://abc.com"},
		{`http://"www.phaelos.com/oubre.html`, `http://www.phaelos.com/oubre.html`},
		{`http://!www.rkm=journal.de/archives/13383`, `http://www.rkm=journal.de/archives/13383`},
		{`http:///en.m.wikipedia.org/ChenLong`, `http://en.m.wikipedia.org/ChenLong`},
		{`http://10.1111/joim.12348`, `https://doi.org/10.1111/joim.12348`},
		{`http://10.1113/jphysiol.2002.026047`, `https://doi.org/10.1113/jphysiol.2002.026047`},
		{`http://10.30.3.16/moodle/course/view.php?id=25`, `http://10.30.3.16/moodle/course/view.php?id=25`},
		{`http://10.3266/RevEspEndocrinolPediatr.pre2015.Nov.330`, `https://doi.org/10.3266/RevEspEndocrinolPediatr.pre2015.Nov.330`},
		{`http://120.107.180.177/1832/9901/099-2-07p.pdf.Accessed`, `http://120.107.180.177/1832/9901/099-2-07p.pdf`},
		{`http://120cartas.ig.com.br/wp/maio-de-2008-um-aniversario-de-120-anos/.Acessoem:set`,
			`http://120cartas.ig.com.br/wp/maio-de-2008-um-aniversario-de-120-anos/`},
		{`http://122.53.86.125/NNS/8thNNS.pdf.Accessed`, `http://122.53.86.125/NNS/8thNNS.pdf`},
		{`http://122.53.86.125/facts_figures2011.pdf.Accessedon`,
			`http://122.53.86.125/facts_figures2011.pdf`},
		{`http://129.3.20.41/eps/fin/papers/0507/0507016.pdf.diaksespadatanggal23Januari`,
			`http://129.3.20.41/eps/fin/papers/0507/0507016.pdf`},
		{`http://129.3.20.41/eps/hew/papers/0512/0512001.pdfAccessed1`,
			`http://129.3.20.41/eps/hew/papers/0512/0512001.pdf`},
		{`http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience`,
			`http://140.120.197.173/Ecology/Download/Timing-MSChart.zip`},
		// Regression: a .docx extension must not be shortened to .doc.
		{`http://example.com/files/report.docx`, `http://example.com/files/report.docx`},
		{`http://example.com/files/report.docxJournalofInsectScience`, `http://example.com/files/report.docx`},
		{`http://example.com/files/report.doc`, `http://example.com/files/report.doc`},
	}
	for _, c := range cases {
		out := SanitizeURL(c.in)
		if out != c.out {
			t.Errorf("SanitizeURL(%q): got %q, want %q", c.in, out, c.out)
		}
	}
}