diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-12 13:16:48 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-12 13:16:48 +0200 |
commit | 238b26c62f1a7d7863b6a1131e07165e67cfeb37 (patch) | |
tree | c092534eda47246c2cb094e0a1909f5fca725c47 /skate | |
parent | d9920692cb7c457b50b3bc73834598b2d8d5e602 (diff) | |
download | refcat-238b26c62f1a7d7863b6a1131e07165e67cfeb37.tar.gz refcat-238b26c62f1a7d7863b6a1131e07165e67cfeb37.zip |
url cleanup: add cases
Diffstat (limited to 'skate')
-rw-r--r-- | skate/url.go | 2 | ||||
-rw-r--r-- | skate/url_test.go | 18 |
2 files changed, 19 insertions, 1 deletion
diff --git a/skate/url.go b/skate/url.go index 3ca4163..d55ffc1 100644 --- a/skate/url.go +++ b/skate/url.go @@ -8,7 +8,7 @@ var ( patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`) patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`) patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`) - patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;]?(abgerufen|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|lastaccessed|acesso|accessoem|accessed).*$`) + patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`) patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|doc|docx|rar))(.*)$`) ) diff --git a/skate/url_test.go b/skate/url_test.go index ffa315c..535c00c 100644 --- a/skate/url_test.go +++ b/skate/url_test.go @@ -43,6 +43,24 @@ func TestSanitizeURL(t *testing.T) { `http://2015.veneziabiennale-japanpavilion.jp/en/`}, {`http://-annalsofneurosciences.org/journal/index.php/annal/article/view/43/67`, `http://annalsofneurosciences.org/journal/index.php/annal/article/view/43/67`}, + {`http://-www.gifted.uconn.edu/Siegle/Dissertations/Eric%20Mann.pdf.Diunduh15`, + `http://www.gifted.uconn.edu/Siegle/Dissertations/Eric%20Mann.pdf`}, + {`http://-www.suparlan.com/pages/posts/.Diakses15Pebruari`, + `http://www.suparlan.com/pages/posts/`}, + {`http://...books.google.com/books?isbn=0873552601`, + `http://books.google.com/books?isbn=0873552601`}, + {`http://.R-project.org`, + `http://R-project.org`}, + {`http://.amazona.com/academia.edu.documents//autogestion.pdfRecibido:24demayode2017`, + `http://amazona.com/academia.edu.documents//autogestion.pdf`}, + {`http://10.1007/s00779-012-0615-1`, + `https://doi.org/10.1007/s00779-012-0615-1`}, + {`http://20.132.48.254/PDFS/ED495503.pdf.Accessedat`, + 
`http://20.132.48.254/PDFS/ED495503.pdf`}, + {`http://82.198.195.82/presse/mitteilungen/2007/Stellungnahme_dsn_BDAG_Internet_20071219.pdf,abgerufenam19`, + `http://82.198.195.82/presse/mitteilungen/2007/Stellungnahme_dsn_BDAG_Internet_20071219.pdf`}, + {`http://CRAN.R-project.org/package=RTextTools.Zugegriffen:6Juni`, + `http://CRAN.R-project.org/package=RTextTools`}, } for _, c := range cases { out := SanitizeURL(c.in) |