From 8f9be813796fa32e8c24b7609c6c60bc5792ebdf Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 12 May 2021 13:24:23 +0200 Subject: add more cases --- skate/url.go | 2 +- skate/url_test.go | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'skate') diff --git a/skate/url.go b/skate/url.go index d55ffc1..1ffff66 100644 --- a/skate/url.go +++ b/skate/url.go @@ -8,7 +8,7 @@ var ( patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`) patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`) patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`) - patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`) + patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`) patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|doc|docx|rar))(.*)$`) ) diff --git a/skate/url_test.go b/skate/url_test.go index 535c00c..e1daf30 100644 --- a/skate/url_test.go +++ b/skate/url_test.go @@ -61,6 +61,12 @@ func TestSanitizeURL(t *testing.T) { `http://82.198.195.82/presse/mitteilungen/2007/Stellungnahme_dsn_BDAG_Internet_20071219.pdf`}, {`http://CRAN.R-project.org/package=RTextTools.Zugegriffen:6Juni`, `http://CRAN.R-project.org/package=RTextTools`}, + {`http://189.28.128.99/provab/docs/geral/edital_28_02_2012_resultado_provab.pdf.Acessoem19/11/2014`, + `http://189.28.128.99/provab/docs/geral/edital_28_02_2012_resultado_provab.pdf`}, + {`http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf.Stanzdnia13.04`, + `http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf`}, + {`http://aalc07.psu.edu/papers/jn_typol_class3.pdf.Stanford`, + `http://aalc07.psu.edu/papers/jn_typol_class3.pdf`}, } for _, c := range cases { out := SanitizeURL(c.in) -- cgit v1.2.3