diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-12 13:24:23 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-12 13:24:23 +0200 |
commit | 8f9be813796fa32e8c24b7609c6c60bc5792ebdf (patch) | |
tree | dfbafaa0e813034ab202c870a9bc65ef521bef1d /skate | |
parent | 238b26c62f1a7d7863b6a1131e07165e67cfeb37 (diff) | |
download | refcat-8f9be813796fa32e8c24b7609c6c60bc5792ebdf.tar.gz refcat-8f9be813796fa32e8c24b7609c6c60bc5792ebdf.zip |
add more cases
Diffstat (limited to 'skate')
-rw-r--r-- | skate/url.go | 2 | ||||
-rw-r--r-- | skate/url_test.go | 6 |
2 files changed, 7 insertions, 1 deletion
```diff
diff --git a/skate/url.go b/skate/url.go
index d55ffc1..1ffff66 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -8,7 +8,7 @@ var (
 	patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
 	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
 	patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
-	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`)
+	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`)
 	patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|doc|docx|rar))(.*)$`)
 )
 
diff --git a/skate/url_test.go b/skate/url_test.go
index 535c00c..e1daf30 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -61,6 +61,12 @@ func TestSanitizeURL(t *testing.T) {
 			`http://82.198.195.82/presse/mitteilungen/2007/Stellungnahme_dsn_BDAG_Internet_20071219.pdf`},
 		{`http://CRAN.R-project.org/package=RTextTools.Zugegriffen:6Juni`,
 			`http://CRAN.R-project.org/package=RTextTools`},
+		{`http://189.28.128.99/provab/docs/geral/edital_28_02_2012_resultado_provab.pdf.Acessoem19/11/2014`,
+			`http://189.28.128.99/provab/docs/geral/edital_28_02_2012_resultado_provab.pdf`},
+		{`http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf.Stanzdnia13.04`,
+			`http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf`},
+		{`http://aalc07.psu.edu/papers/jn_typol_class3.pdf.Stanford`,
+			`http://aalc07.psu.edu/papers/jn_typol_class3.pdf`},
 	}
 	for _, c := range cases {
 		out := SanitizeURL(c.in)
```