aboutsummaryrefslogtreecommitdiffstats
path: root/skate/url.go
blob: 252d61c12d3b109b2c085d103d166d9c42185b23 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
package skate

import (
	"index/suffixarray"
	"regexp"
	"sort"
	"strings"
)

var (
	// Scheme followed by a run of non-word characters before the host,
	// e.g. `http://!!!x.com` (the junk run is dropped in SanitizeURL).
	patNonWordDomain       = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
	// Scheme followed by extra slashes, e.g. `http:///en.m.wikipedia.org/...`.
	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
	// A bare DOI mistakenly used as a host, e.g. `http://10.1113/jphysiol...`;
	// rewritten to a proper `https://doi.org/` link.
	patHttpDOI             = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
	// Trailing "accessed on ..." boilerplate in many languages (including
	// hyphen-broken variants like "acces-sed"); everything from the keyword
	// to end-of-string is stripped. (?iU): case-insensitive, ungreedy.
	patAccessedOn          = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
	// Text glued on after a file extension, e.g.
	// `http://.../Timing-MSChart.zipJournalofInsectScience`; keeps the part
	// up to and including the extension.
	patFileExtraSuffix     = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
	// A scheme word followed by non-word chars instead of "://",
	// e.g. "http;//x" — repaired in sanitizeRaw.
	patBrokenSchemaPrefix  = regexp.MustCompile(`(http|https)\W+(.*)`)

	// Note: [...] and naming things, https://stackoverflow.com/q/56297974/89391.
	// Prefixes considered already-valid schemes; such strings skip sanitizeRaw.
	okPrefixes = []string{"http://", "https://", "ftp://"}
)

// SanitizeURL applies a series of cleanup rules to URL strings, e.g. as found
// in references extracted with GROBID. It returns an empty string when no URL
// could be discovered; even non-empty results may still be improbable links.
// This is a relatively expensive operation (roughly 20k urls/s), and a quick
// url.Parse check does not help much, since syntactically valid strings such
// as http://!!!x.com are still unlikely URLs.
func SanitizeURL(s string) string {
	if !HasAnyPrefix(s, okPrefixes) {
		if s = sanitizeRaw(s); s == "" {
			return ""
		}
	}
	// Locate every "http" occurrence; building the index is only about 15%
	// of the total time spent here.
	sa := suffixarray.New([]byte(s))
	hits := sa.Lookup([]byte("http"), -1)
	switch {
	case len(hits) == 1:
		// Strip leading garbage, e.g.
		// ISSN-2177-4129periodicos.ufpel.edu.br/ojs2/index.php/Memoriahttp://dx.doi.org/10.15210/rmr.v8i14.7485
		s = s[hits[0]:]
	case len(hits) > 1:
		// Keep only the first URL when several are glued together, e.g.
		// http://ailab.ist.psu.edu/bcpred/SVMTriP:http://sysbio.unl.edu/SVMTriP/prediction.phpBcell
		sort.Ints(hits)
		s = s[hits[0]:hits[1]]
		s = strings.TrimRight(s, ":")
		s = strings.TrimRight(s, ";")
	}
	// http://!!!:, http://!, http://"
	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)
	// http:///en.m.wikipedia.org/ChenLong
	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)
	// http://10.1113/jphysiol.2002.026047
	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
	// .Acesso:11Abr, accessedon15, ...
	if patAccessedOn.MatchString(s) {
		s = patAccessedOn.ReplaceAllString(s, `$1`)
	}
	// http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience
	if patFileExtraSuffix.MatchString(s) {
		s = patFileExtraSuffix.ReplaceAllString(s, `$1`)
	}
	return s
}

func sanitizeRaw(s string) string {
	if len(s) < 4 {
		return ""
	}
	if !strings.Contains(s, ".") {
		return ""
	}
	index := suffixarray.New([]byte(s))
	indices := index.Lookup([]byte("www."), 1)
	sort.Ints(indices)
	if len(indices) > 0 {
		return "http://" + s[indices[0]:]
	}
	if patBrokenSchemaPrefix.MatchString(s) {
		return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`)
	}
	return "http://" + s
}

// HasAnyPrefix reports whether s starts with any of the given prefixes.
func HasAnyPrefix(s string, prefix []string) bool {
	for _, p := range prefix {
		if strings.HasPrefix(s, p) {
			return true
		}
	}
	return false
}