diff options
Diffstat (limited to 'skate/url.go')
| -rw-r--r-- | skate/url.go | 16 | 
1 files changed, 12 insertions, 4 deletions
diff --git a/skate/url.go b/skate/url.go index 1ffff66..539613d 100644 --- a/skate/url.go +++ b/skate/url.go @@ -1,6 +1,7 @@  package skate  import ( +	"index/suffixarray"  	"regexp"  ) @@ -8,12 +9,21 @@ var (  	patNonWordDomain       = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)  	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)  	patHttpDOI             = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`) -	patAccessedOn          = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`) -	patFileExtraSuffix     = regexp.MustCompile(`(http.*[.](zip|pdf|html|doc|docx|rar))(.*)$`) +	patAccessedOn          = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`) +	patFileExtraSuffix     = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))(.*)$`)  )  // SanitizeURL applies various cleanup rules on URLs as found in references. +// XXX: Sometimes a URL contains other identifying information, like: +// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543. +// http://10.3386/w20634https://doi.org/10.3386/w20634  func SanitizeURL(s string) string { +	index := suffixarray.New([]byte(s)) +	indices := index.Lookup([]byte("http"), -1) +	if len(indices) > 1 { +		s = s[0:indices[1]] // only use the first +	} +  	// http://!!!:  	// http://!  	// http://" @@ -25,8 +35,6 @@ func SanitizeURL(s string) string {  	// http://10.1113/jphysiol.2002.026047  	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`) -	// http://10.3386/w20634https://doi.org/10.3386/w20634 -  	// .diaksestanggal27-03-2017.10.30Wib  	// accessedon15  	// .Accessed  | 
