package skate

import (
	"index/suffixarray"
	"regexp"
	"strings"
)

var (
	// patNonWordDomain matches garbage characters stuck between the scheme
	// and the domain, e.g. http://!!!: or http://".
	patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
	// patRepeatedHttpSlashes matches extra slashes after the scheme,
	// e.g. http:///en.m.wikipedia.org.
	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
	// patHttpDOI matches a DOI appearing in place of the host.
	patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
	// patAccessedOn matches "accessed on" style suffixes in various
	// languages, which references often glue onto the URL.
	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
	// patFileExtraSuffix matches text glued onto a URL after a known file
	// extension.
	patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
)
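
// For example (hypothetical call, mirroring the DOI case from the doc
// comment below), patHttpDOI turns a scheme-prefixed DOI into a resolver URL:
//
//	patHttpDOI.ReplaceAllString("http://10.3386/w20634", `https://doi.org/$2`)
//	// => "https://doi.org/10.3386/w20634"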

// SanitizeURL applies various cleanup rules to URLs as found in references.
// It returns an empty string when no URL could be constructed; even then,
// many results will still not be valid URLs. XXX: Sometimes a URL contains
// other identifying information, like:
// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
// http://10.3386/w20634https://doi.org/10.3386/w20634
func SanitizeURL(s string) string {
	var (
		// building the suffix array seems to account for only about 15% of
		// total time spent
		index   = suffixarray.New([]byte(s))
		indices = index.Lookup([]byte("http"), -1)
	)
	if !strings.HasPrefix(s, "http") && !strings.HasPrefix(s, "ftp") {
		s = sanitizeRaw(s, index)
		if s == "" {
			return s
		}
	}
	if len(indices) > 1 {
		s = s[0:indices[1]] // keep only the first URL
		s = strings.TrimRight(s, ":")
		s = strings.TrimRight(s, ";")
	}
	// http://!!!:
	// http://!
	// http://"
	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)
	// http:///en.m.wikipedia.org/ChenLong
	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)
	// http://10.1113/jphysiol.2002.026047
	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
	// .diaksestanggal27-03-2017.10.30Wib
	// accessedon15
	// .Accessed
	// Acessoem:10/09/2012
	// .Acesso:11Abr
	if patAccessedOn.MatchString(s) {
		s = patAccessedOn.ReplaceAllString(s, `$1`)
	}
	// http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience
	if patFileExtraSuffix.MatchString(s) {
		s = patFileExtraSuffix.ReplaceAllString(s, `$1`)
	}
	return s
}
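
// Illustrative end-to-end behavior (a sketch; inputs are hypothetical
// reference strings, outputs follow from the rules above):
//
//	SanitizeURL("http://10.1113/jphysiol.2002.026047")
//	// => "https://doi.org/10.1113/jphysiol.2002.026047"
//	SanitizeURL("www.example.org/file.pdfAccessed12Mar2020")
//	// => "http://www.example.org/file.pdf"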

// sanitizeRaw tries to construct a URL from a string that does not start
// with http or ftp; it returns an empty string, if that fails.
func sanitizeRaw(s string, index *suffixarray.Index) string {
	// Too short, or no dot at all: give up.
	if len(s) < 4 {
		return ""
	}
	if !strings.Contains(s, ".") {
		return ""
	}
	// If the string contains "www." somewhere, drop everything before it.
	indices := index.Lookup([]byte("www."), 1)
	if len(indices) > 0 {
		s = "http://" + s[indices[0]:]
	} else {
		s = "http://" + s
	}
	return s
}
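
// A sketch of the fallback (hypothetical input; index is the suffix array
// built over s in SanitizeURL): anything before an embedded "www." is
// dropped, then a scheme is prepended:
//
//	sanitizeRaw("seewww.example.org/abc", index)
//	// => "http://www.example.org/abc"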