aboutsummaryrefslogtreecommitdiffstats
path: root/skate/url_test.go
blob: caf05a479ae1404f2b0a84232f557cf3c44e48ee (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
package skate

import "testing"

// TestSanitizeURL runs SanitizeURL over a table of raw URL-like strings and
// compares each result with the cleaned URL the table expects. An empty
// expected value means the input is expected to yield an empty string.
//
// The table covers, among others: stray punctuation after the scheme, text
// fused onto the end of a URL (e.g. "...pdf.Accessed"), inputs without a
// scheme (expected to come back with an "http://" prefix), and
// "http://10.NNNN/..." inputs that are expected to be rewritten to
// "https://doi.org/10.NNNN/...".
func TestSanitizeURL(t *testing.T) {
	cases := []struct {
		in  string
		out string
	}{
		{"", ""},
		{"a", ""},
		{"???", ""},
		{"???***", ""},
		{"???***___123", ""},
		{"http://abc.com", "http://abc.com"},
		{"http://!!abc.com", "http://abc.com"},
		{`http://"www.phaelos.com/oubre.html`, `http://www.phaelos.com/oubre.html`},
		{`http://!www.rkm=journal.de/archives/13383`, `http://www.rkm=journal.de/archives/13383`},
		{`http:///en.m.wikipedia.org/ChenLong`, `http://en.m.wikipedia.org/ChenLong`},
		{`http://10.1111/joim.12348`, `https://doi.org/10.1111/joim.12348`},
		{`http://10.1113/jphysiol.2002.026047`, `https://doi.org/10.1113/jphysiol.2002.026047`},
		{`http://10.30.3.16/moodle/course/view.php?id=25`, `http://10.30.3.16/moodle/course/view.php?id=25`},
		{`http://10.3266/RevEspEndocrinolPediatr.pre2015.Nov.330`, `https://doi.org/10.3266/RevEspEndocrinolPediatr.pre2015.Nov.330`},
		{`http://120.107.180.177/1832/9901/099-2-07p.pdf.Accessed`, `http://120.107.180.177/1832/9901/099-2-07p.pdf`},
		{`http://120cartas.ig.com.br/wp/maio-de-2008-um-aniversario-de-120-anos/.Acessoem:set`,
			`http://120cartas.ig.com.br/wp/maio-de-2008-um-aniversario-de-120-anos/`},
		{`http://122.53.86.125/NNS/8thNNS.pdf.Accessed`, `http://122.53.86.125/NNS/8thNNS.pdf`},
		{`http://122.53.86.125/facts_figures2011.pdf.Accessedon`,
			`http://122.53.86.125/facts_figures2011.pdf`},
		{`http://129.3.20.41/eps/fin/papers/0507/0507016.pdf.diaksespadatanggal23Januari`,
			`http://129.3.20.41/eps/fin/papers/0507/0507016.pdf`},
		{`http://129.3.20.41/eps/hew/papers/0512/0512001.pdfAccessed1`,
			`http://129.3.20.41/eps/hew/papers/0512/0512001.pdf`},
		{`http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience`,
			`http://140.120.197.173/Ecology/Download/Timing-MSChart.zip`},
		{`141.213.232.243/bitstream/handle/2027.42/86336/apterc_1.pdf?sequence=1`,
			`http://141.213.232.243/bitstream/handle/2027.42/86336/apterc_1.pdf?sequence=1`},
		{`http://141.232.10.32/pm/recover/recover_docs/perf_measures/062812_rec_pm_scs_salinity_flbay.pdfRECOVER`,
			`http://141.232.10.32/pm/recover/recover_docs/perf_measures/062812_rec_pm_scs_salinity_flbay.pdf`},
		{`http://2010.census.gov/news/releases/operations/cb11-cn125.html.lastaccessed4`,
			`http://2010.census.gov/news/releases/operations/cb11-cn125.html`},
		{`http://2014hit.blogspot.com.tr/2014/12/george-gerbnerin-tv-arastrmas-ve-ekme.htmladresindenedinilmiştir`,
			`http://2014hit.blogspot.com.tr/2014/12/george-gerbnerin-tv-arastrmas-ve-ekme.html`},
		{`http://2015.ses.org.tr/wp-ontent/uploads/toplumsalcinsiyetrolleri.pdfsayfasındanulaşıl-mıştır`,
			`http://2015.ses.org.tr/wp-ontent/uploads/toplumsalcinsiyetrolleri.pdf`},
		{`http://2015.veneziabiennale-japanpavilion.jp/en/Consultadael20deoctubrealas14`,
			`http://2015.veneziabiennale-japanpavilion.jp/en/`},
		{`http://-annalsofneurosciences.org/journal/index.php/annal/article/view/43/67`,
			`http://annalsofneurosciences.org/journal/index.php/annal/article/view/43/67`},
		{`http://-www.gifted.uconn.edu/Siegle/Dissertations/Eric%20Mann.pdf.Diunduh15`,
			`http://www.gifted.uconn.edu/Siegle/Dissertations/Eric%20Mann.pdf`},
		{`http://-www.suparlan.com/pages/posts/.Diakses15Pebruari`,
			`http://www.suparlan.com/pages/posts/`},
		{`http://...books.google.com/books?isbn=0873552601`,
			`http://books.google.com/books?isbn=0873552601`},
		{`http://.R-project.org`,
			`http://R-project.org`},
		{`http://.amazona.com/academia.edu.documents//autogestion.pdfRecibido:24demayode2017`,
			`http://amazona.com/academia.edu.documents//autogestion.pdf`},
		{`http://10.1007/s00779-012-0615-1`,
			`https://doi.org/10.1007/s00779-012-0615-1`},
		{`http://20.132.48.254/PDFS/ED495503.pdf.Accessedat`,
			`http://20.132.48.254/PDFS/ED495503.pdf`},
		{`http://82.198.195.82/presse/mitteilungen/2007/Stellungnahme_dsn_BDAG_Internet_20071219.pdf,abgerufenam19`,
			`http://82.198.195.82/presse/mitteilungen/2007/Stellungnahme_dsn_BDAG_Internet_20071219.pdf`},
		{`http://CRAN.R-project.org/package=RTextTools.Zugegriffen:6Juni`,
			`http://CRAN.R-project.org/package=RTextTools`},
		{`http://189.28.128.99/provab/docs/geral/edital_28_02_2012_resultado_provab.pdf.Acessoem19/11/2014`,
			`http://189.28.128.99/provab/docs/geral/edital_28_02_2012_resultado_provab.pdf`},
		{`http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf.Stanzdnia13.04`,
			`http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf`},
		{`http://aalc07.psu.edu/papers/jn_typol_class3.pdf.Stanford`,
			`http://aalc07.psu.edu/papers/jn_typol_class3.pdf`},
		{`http://aboriginalhealth.flinders.edu.au/Newsletters/2010/Downloads/SHRP%20FINAL%20REPORT%20PART%20TWO%20July%202009.pdfAccessed14/12/2012`,
			`http://aboriginalhealth.flinders.edu.au/Newsletters/2010/Downloads/SHRP%20FINAL%20REPORT%20PART%20TWO%20July%202009.pdf`},
		{`http://about-air.ru/svojstva-vozduha/davlenie-vozduha/normalnoe-atmosfernoe-davlenie.html,доступ-свободный,датаобращения15.04.2017`,
			`http://about-air.ru/svojstva-vozduha/davlenie-vozduha/normalnoe-atmosfernoe-davlenie.html`},
		{`http://acl.ldc.upenn.edu/W/W98/W98-1120.pdfDateofaccess`,
			`http://acl.ldc.upenn.edu/W/W98/W98-1120.pdf`},
		{`http://acl.mit.edu/pa-pers/2012-uber-conference-submitted.pdf//49thIEEE`,
			`http://acl.mit.edu/pa-pers/2012-uber-conference-submitted.pdf`},
		{`http://acoss.org.au/policy/community_services/emergency_relief_handbook/,accessed1st`,
			`http://acoss.org.au/policy/community_services/emergency_relief_handbook/`},
		{`http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/.Ac-cessedon06/12/2016`,
			`http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`},
		{`http://acta.uta.fi/pdf/951-44-4701-6.pdfRHEINDORF`,
			`http://acta.uta.fi/pdf/951-44-4701-6.pdf`},
		{`http://admi.net/jo/20080423/ECE-C0771649A.html.Pageconsultéele25septembre`,
			`http://admi.net/jo/20080423/ECE-C0771649A.html`},
		{`http://admin.localgov.co.uk/his_localgov/view/images/uploaded/Image/childrensblackpool.PDF.Lastaccess8`,
			`http://admin.localgov.co.uk/his_localgov/view/images/uploaded/Image/childrensblackpool.PDF`},
		{`http://aec.ifas.ufl.edu/abrams/step/critical_litreview.pdfİndirme`,
			`http://aec.ifas.ufl.edu/abrams/step/critical_litreview.pdf`},
		{`http://aem.asm.org/Downloadedfrom`, `http://aem.asm.org/`},
		{`http://aem.asm.org/content/67/6/2766.full.pdf+htmlWITTWER`,
			`http://aem.asm.org/content/67/6/2766.full.pdf+html`},
		{`http://agris.fao.org/agris-search/search.do?recordID=BR2013800115https://doi.org/10.5747/ca.2010.v06.n1.a044`,
			`http://agris.fao.org/agris-search/search.do?recordID=BR2013800115`},
		{`http://ailab.ist.psu.edu/bcpred/SVMTriP:http://sysbio.unl.edu/SVMTriP/prediction.phpBcell`,
			`http://ailab.ist.psu.edu/bcpred/SVMTriP`},
		{`http://aim.bmj.com/content/31/1/23.full.pdf+htmlAcessoem:15Dez`,
			`http://aim.bmj.com/content/31/1/23.full.pdf+html`},
		{`http://ainfo.cnptia.embrapa.br/digital/bitstream/CNPAT-2010/8608/1/Ci-017.pdfAcessed06`,
			`http://ainfo.cnptia.embrapa.br/digital/bitstream/CNPAT-2010/8608/1/Ci-017.pdf`},
		{`12s`, ``},
		{`12spoaspdop`, ``},
		{`0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`,
			`http://www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`},
		{`CRAN.R-project.org/package=vegan`,
			`http://CRAN.R-project.org/package=vegan`},
		{`CRD42014009228.www.crd.york.ac.uk/PROSPERO/display_record.asp?ID=CRD42014009228`,
			`http://www.crd.york.ac.uk/PROSPERO/display_record.asp?ID=CRD42014009228`},
		{`ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`,
			`ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`},
		{`ftp-eng.cisco.com/sobgp/index.html`,
			`http://ftp-eng.cisco.com/sobgp/index.html`},
		{`ftp.cdc.gov/pub/Publications/mmwr/SS/SS4703.pdf`,
			`http://ftp.cdc.gov/pub/Publications/mmwr/SS/SS4703.pdf`},
		{`ftpftp.inria.fr`,
			`http://ftpftp.inria.fr`},
		{`http.bglink.com/personal/batakovic`, `http://bglink.com/personal/batakovic`},
		{`http.kalsel.bps.go.id`, `http://kalsel.bps.go.id`},
		{`http.www.admhmao.ru/people/frame.htm`, `http://www.admhmao.ru/people/frame.htm`},
		{`http.worldbank.org/sq`, `http://worldbank.org/sq`},
		{`httpwww.sun.com`, `http://www.sun.com`},
		{`httpswww.unos.org`, `http://www.unos.org`},
		{`ics.uci.edu/pub/ietf/`, `http://ics.uci.edu/pub/ietf/`},
		{`ISSN-2177-4129periodicos.ufpel.edu.br/ojs2/index.php/Memoriahttp://dx.doi.org/10.15210/rmr.v8i14.7485`,
			`http://dx.doi.org/10.15210/rmr.v8i14.7485`},
		{`Shttp://hdl.handle.net/1765/1163`,
			`http://hdl.handle.net/1765/1163`},
		{`cdec.water.ca.gov/misc/DailyPrecip.html`,
			`http://cdec.water.ca.gov/misc/DailyPrecip.html`},
		{`https://www.ibge.gov.br/estatisticas/sociais/populacao/9103-estimativas-de-populacao.html?=&t=resultados.Accessed22`,
			`https://www.ibge.gov.br/estatisticas/sociais/populacao/9103-estimativas-de-populacao.html?=&t=resultados`},
		{`https://doi.org/10.1101/2020.06.23.167395doi:bioRxivpreprint`, // TODO: e.g. remove "doi:" or the like
			`https://doi.org/10.1101/2020.06.23.167395doi:bioRxivpreprint`},
		{`mail:claire.wyart@icm-institute.org,claire.wyart@inserm.frhttp://dx.doi.org/10.1016/j.cub.2015.01.006`,
			`http://dx.doi.org/10.1016/j.cub.2015.01.006`},
		{`http://www.nbcnews.com/technology/virtual-cockpit-what-it-takes-fly-drone-1C9319684.Acessoem:15/07/2013`,
			`http://www.nbcnews.com/technology/virtual-cockpit-what-it-takes-fly-drone-1C9319684`},
	}
	for _, c := range cases {
		// Errorf rather than Fatalf: keep going so that a single run reports
		// every failing row, and name the input so the row can be located.
		if got := SanitizeURL(c.in); got != c.out {
			t.Errorf("SanitizeURL(%q) = %q, want %q", c.in, got, c.out)
		}
	}
}

// TestHasAnyPrefix checks HasAnyPrefix against a small table of strings and
// candidate prefix lists, including a nil list and an empty list (both
// expected to report false).
func TestHasAnyPrefix(t *testing.T) {
	cases := []struct {
		s      string
		prefix []string
		result bool
	}{
		{s: "", prefix: nil, result: false},
		{s: "", prefix: []string{}, result: false},
		{s: "a", prefix: []string{}, result: false},
		{s: "a", prefix: []string{"a"}, result: true},
		{s: "a", prefix: []string{"aa"}, result: false},
		{s: "aa", prefix: []string{"a"}, result: true},
	}
	for _, c := range cases {
		// Report the inputs with every mismatch and continue, so all failing
		// rows show up in one run.
		if got := HasAnyPrefix(c.s, c.prefix); got != c.result {
			t.Errorf("HasAnyPrefix(%q, %q) = %v, want %v", c.s, c.prefix, got, c.result)
		}
	}
}

// BenchmarkSanitizeURL times SanitizeURL on two fixed inputs, each run as its
// own sub-benchmark: "http" (an input that already carries an http:// scheme)
// and "plain" (an input without a scheme).
func BenchmarkSanitizeURL(b *testing.B) {
	type benchCase struct {
		label string
		input string
	}
	benchmarks := []benchCase{
		{
			label: `http`,
			input: `http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`,
		},
		{
			label: `plain`,
			input: `0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`,
		},
	}
	for _, bc := range benchmarks {
		// b.Run executes the closure before returning, so referring to the
		// loop variable inside it is safe.
		b.Run(bc.label, func(sub *testing.B) {
			for i := 0; i < sub.N; i++ {
				SanitizeURL(bc.input)
			}
		})
	}
}