aboutsummaryrefslogtreecommitdiffstats
path: root/skate/url.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/url.go')
-rw-r--r-- skate/url.go 16
1 files changed, 12 insertions, 4 deletions
diff --git a/skate/url.go b/skate/url.go
index 1ffff66..539613d 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -1,6 +1,7 @@
package skate
import (
+ "index/suffixarray"
"regexp"
)
@@ -8,12 +9,21 @@ var (
patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
- patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`)
- patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|doc|docx|rar))(.*)$`)
+ patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
+ patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))(.*)$`)
)
// SanitizeURL applies various cleanup rules on URLs as found in references.
+// XXX: Sometimes a URL contains other identifying information, like:
+// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
+// http://10.3386/w20634https://doi.org/10.3386/w20634
func SanitizeURL(s string) string {
+ index := suffixarray.New([]byte(s))
+ indices := index.Lookup([]byte("http"), -1)
+ if len(indices) > 1 {
+ s = s[0:indices[1]] // only use the first
+ }
+
// http://!!!:
// http://!
// http://"
@@ -25,8 +35,6 @@ func SanitizeURL(s string) string {
// http://10.1113/jphysiol.2002.026047
s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
- // http://10.3386/w20634https://doi.org/10.3386/w20634
-
// .diaksestanggal27-03-2017.10.30Wib
// accessedon15
// .Accessed