diff options
Diffstat (limited to 'skate')
-rw-r--r-- | skate/url.go | 6 | ||||
-rw-r--r-- | skate/url_test.go | 17 |
2 files changed, 19 insertions, 4 deletions
diff --git a/skate/url.go b/skate/url.go
index 66b9312..b104651 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -15,7 +15,9 @@ var (
 )

 // SanitizeURL applies various cleanup rules on URLs as found in references.
-// XXX: Sometimes a URL contains other identifying information, like:
+// Returns an empty string when no URL could be constructed. Still, many
+// results will not be a URL after all. XXX: Sometimes a URL contains other
+// identifying information, like:
 // http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
 // http://10.3386/w20634https://doi.org/10.3386/w20634
 func SanitizeURL(s string) string {
@@ -24,7 +26,7 @@ func SanitizeURL(s string) string {
 		index = suffixarray.New([]byte(s))
 		indices = index.Lookup([]byte("http"), -1)
 	)
-	if !strings.HasPrefix(s, "http") {
+	if !strings.HasPrefix(s, "http") && !strings.HasPrefix(s, "ftp") {
 		s = sanitizeRaw(s, index)
 		if s == "" {
 			return s
diff --git a/skate/url_test.go b/skate/url_test.go
index 073509b..d866e7a 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -106,6 +106,8 @@ func TestSanitizeURL(t *testing.T) {
 			`http://CRAN.R-project.org/package=vegan`},
 		{`CRD42014009228.www.crd.york.ac.uk/PROSPERO/display_record.asp?ID=CRD42014009228`,
 			`http://www.crd.york.ac.uk/PROSPERO/display_record.asp?ID=CRD42014009228`},
+		{`ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`,
+			`ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`},
 	}
 	for _, c := range cases {
 		out := SanitizeURL(c.in)
@@ -116,7 +118,18 @@ func TestSanitizeURL(t *testing.T) {
 	}
 }
 func BenchmarkSanitizeURL(b *testing.B) {
-	for n := 0; n < b.N; n++ {
-		SanitizeURL(`http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`)
+	var bms = []struct {
+		name string
+		in string
+	}{
+		{"http", `http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`},
+		{"plain", `0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`},
+	}
+	for _, bm := range bms {
+		b.Run(bm.name, func(b *testing.B) {
+			for n := 0; n < b.N; n++ {
+				SanitizeURL(bm.in)
+			}
+		})
 	}
 }