aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--skate/url.go2
-rw-r--r--skate/url_test.go18
2 files changed, 19 insertions, 1 deletions
diff --git a/skate/url.go b/skate/url.go
index 3ca4163..d55ffc1 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -8,7 +8,7 @@ var (
patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
- patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;]?(abgerufen|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|lastaccessed|acesso|accessoem|accessed).*$`)
+ patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`)
patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|doc|docx|rar))(.*)$`)
)
diff --git a/skate/url_test.go b/skate/url_test.go
index ffa315c..535c00c 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -43,6 +43,24 @@ func TestSanitizeURL(t *testing.T) {
`http://2015.veneziabiennale-japanpavilion.jp/en/`},
{`http://-annalsofneurosciences.org/journal/index.php/annal/article/view/43/67`,
`http://annalsofneurosciences.org/journal/index.php/annal/article/view/43/67`},
+ {`http://-www.gifted.uconn.edu/Siegle/Dissertations/Eric%20Mann.pdf.Diunduh15`,
+ `http://www.gifted.uconn.edu/Siegle/Dissertations/Eric%20Mann.pdf`},
+ {`http://-www.suparlan.com/pages/posts/.Diakses15Pebruari`,
+ `http://www.suparlan.com/pages/posts/`},
+ {`http://...books.google.com/books?isbn=0873552601`,
+ `http://books.google.com/books?isbn=0873552601`},
+ {`http://.R-project.org`,
+ `http://R-project.org`},
+ {`http://.amazona.com/academia.edu.documents//autogestion.pdfRecibido:24demayode2017`,
+ `http://amazona.com/academia.edu.documents//autogestion.pdf`},
+ {`http://10.1007/s00779-012-0615-1`,
+ `https://doi.org/10.1007/s00779-012-0615-1`},
+ {`http://20.132.48.254/PDFS/ED495503.pdf.Accessedat`,
+ `http://20.132.48.254/PDFS/ED495503.pdf`},
+ {`http://82.198.195.82/presse/mitteilungen/2007/Stellungnahme_dsn_BDAG_Internet_20071219.pdf,abgerufenam19`,
+ `http://82.198.195.82/presse/mitteilungen/2007/Stellungnahme_dsn_BDAG_Internet_20071219.pdf`},
+ {`http://CRAN.R-project.org/package=RTextTools.Zugegriffen:6Juni`,
+ `http://CRAN.R-project.org/package=RTextTools`},
}
for _, c := range cases {
out := SanitizeURL(c.in)