about summary refs log tree commit diff stats
path: root/skate
diff options
context:
space:
mode:
Diffstat (limited to 'skate')
-rw-r--r-- skate/url.go 2
-rw-r--r-- skate/url_test.go 6
2 files changed, 7 insertions, 1 deletions
diff --git a/skate/url.go b/skate/url.go
index d55ffc1..1ffff66 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -8,7 +8,7 @@ var (
patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
- patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`)
+ patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`)
patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|doc|docx|rar))(.*)$`)
)
diff --git a/skate/url_test.go b/skate/url_test.go
index 535c00c..e1daf30 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -61,6 +61,12 @@ func TestSanitizeURL(t *testing.T) {
`http://82.198.195.82/presse/mitteilungen/2007/Stellungnahme_dsn_BDAG_Internet_20071219.pdf`},
{`http://CRAN.R-project.org/package=RTextTools.Zugegriffen:6Juni`,
`http://CRAN.R-project.org/package=RTextTools`},
+ {`http://189.28.128.99/provab/docs/geral/edital_28_02_2012_resultado_provab.pdf.Acessoem19/11/2014`,
+ `http://189.28.128.99/provab/docs/geral/edital_28_02_2012_resultado_provab.pdf`},
+ {`http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf.Stanzdnia13.04`,
+ `http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf`},
+ {`http://aalc07.psu.edu/papers/jn_typol_class3.pdf.Stanford`,
+ `http://aalc07.psu.edu/papers/jn_typol_class3.pdf`},
}
for _, c := range cases {
out := SanitizeURL(c.in)