about summary refs log tree commit diff stats
path: root/skate
diff options
context:
space:
mode:
Diffstat (limited to 'skate')
-rw-r--r-- skate/url.go 6
-rw-r--r-- skate/url_test.go 17
2 files changed, 19 insertions, 4 deletions
diff --git a/skate/url.go b/skate/url.go
index 66b9312..b104651 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -15,7 +15,9 @@ var (
)
// SanitizeURL applies various cleanup rules on URLs as found in references.
-// XXX: Sometimes a URL contains other identifying information, like:
+// Returns an empty string when no URL could be constructed. Still, many
+// results will not be a URL after all. XXX: Sometimes a URL contains other
+// identifying information, like:
// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
// http://10.3386/w20634https://doi.org/10.3386/w20634
func SanitizeURL(s string) string {
@@ -24,7 +26,7 @@ func SanitizeURL(s string) string {
index = suffixarray.New([]byte(s))
indices = index.Lookup([]byte("http"), -1)
)
- if !strings.HasPrefix(s, "http") {
+ if !strings.HasPrefix(s, "http") && !strings.HasPrefix(s, "ftp") {
s = sanitizeRaw(s, index)
if s == "" {
return s
diff --git a/skate/url_test.go b/skate/url_test.go
index 073509b..d866e7a 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -106,6 +106,8 @@ func TestSanitizeURL(t *testing.T) {
`http://CRAN.R-project.org/package=vegan`},
{`CRD42014009228.www.crd.york.ac.uk/PROSPERO/display_record.asp?ID=CRD42014009228`,
`http://www.crd.york.ac.uk/PROSPERO/display_record.asp?ID=CRD42014009228`},
+ {`ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`,
+ `ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`},
}
for _, c := range cases {
out := SanitizeURL(c.in)
@@ -116,7 +118,18 @@ func TestSanitizeURL(t *testing.T) {
}
func BenchmarkSanitizeURL(b *testing.B) {
- for n := 0; n < b.N; n++ {
- SanitizeURL(`http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`)
+ var bms = []struct {
+ name string
+ in string
+ }{
+ {"http", `http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`},
+ {"plain", `0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`},
+ }
+ for _, bm := range bms {
+ b.Run(bm.name, func(b *testing.B) {
+ for n := 0; n < b.N; n++ {
+ SanitizeURL(bm.in)
+ }
+ })
}
}