-rw-r--r-- | skate/url.go | 51
1 files changed, 22 insertions, 29 deletions
diff --git a/skate/url.go b/skate/url.go
index b59103f..d8560ac 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -18,22 +18,10 @@ var (
 	okSchemas = []string{"http://", "https://", "ftp://"}
 )
 
-// hasAnyPrefixes returns true, if any of the prefixes matches string s.
-func hasAnyPrefix(s string, prefix []string) bool {
-	for _, p := range prefix {
-		if strings.HasPrefix(s, p) {
-			return true
-		}
-	}
-	return false
-}
-
-// SanitizeURL applies various cleanup rules on URLs as found in references.
-// Returns an empty string when no URL could be constructed. Still, many
-// results will not be a URL after all. XXX: Sometimes a URL contains other
-// identifying information, like:
-// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
-// http://10.3386/w20634https://doi.org/10.3386/w20634
+// SanitizeURL applies various cleanup rules on URLs (as they are found in
+// references extracted with GROBID). Returns an empty string when no URL
+// could be discovered. Still, many results will not be a URL even after
+// sanitization.
 func SanitizeURL(s string) string {
 	if !hasAnyPrefix(s, okSchemas) {
 		s = sanitizeRaw(s)
@@ -47,23 +35,18 @@ func SanitizeURL(s string) string {
 		indices = index.Lookup([]byte("http"), -1)
 	)
 	if len(indices) > 1 {
+		// http://ailab.ist.psu.edu/bcpred/SVMTriP:http://sysbio.unl.edu/SVMTriP/prediction.phpBcell
 		s = s[0:indices[1]] // only use the first
 		s = strings.TrimRight(s, ":")
 		s = strings.TrimRight(s, ";")
 	}
-	// http://!!!:
-	// http://!
-	// http://"
+	// http://!!!:, // http://!, // http://"
 	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)
 	// http:///en.m.wikipedia.org/ChenLong
 	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)
 	// http://10.1113/jphysiol.2002.026047
 	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
-	// .diaksestanggal27-03-2017.10.30Wib
-	// accessedon15
-	// .Accessed
-	// Acessoem:10/09/2012
-	// .Acesso:11Abr
+	// .Acesso:11Abr, accessedon15, ...
 	if patAccessedOn.MatchString(s) {
 		s = patAccessedOn.ReplaceAllString(s, `$1`)
 	}
@@ -75,21 +58,31 @@ func SanitizeURL(s string) string {
 }
 
 func sanitizeRaw(s string) string {
-	index := suffixarray.New([]byte(s))
 	if len(s) < 4 {
 		return ""
 	}
 	if !strings.Contains(s, ".") {
 		return ""
 	}
-	indices := index.Lookup([]byte("www."), 1)
+	var (
+		index   = suffixarray.New([]byte(s))
+		indices = index.Lookup([]byte("www."), 1)
+	)
 	if len(indices) > 0 {
 		return "http://" + s[indices[0]:]
 	}
 	if patBrokenSchemaPrefix.MatchString(s) {
 		return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`)
 	}
-	s = "http://" + s
-	return s
-	// Re-trievedfrom
+	return "http://" + s
+}
+
+// hasAnyPrefixes returns true, if any of the prefixes matches string s.
+func hasAnyPrefix(s string, prefix []string) bool {
+	for _, p := range prefix {
+		if strings.HasPrefix(s, p) {
+			return true
+		}
+	}
+	return false
+}
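For orientation, below is a minimal, self-contained sketch of the suffix array lookup that the reworked sanitizeRaw relies on. It is not the package code: the regex-based fixes (patBrokenSchemaPrefix, patHttpDOI, and friends) are omitted, and the helper name sanitizeRawSketch plus the sample inputs are illustrative assumptions only.

package main

import (
	"fmt"
	"index/suffixarray"
	"strings"
)

// hasAnyPrefix returns true if any of the given prefixes matches string s,
// mirroring the helper moved to the bottom of url.go in this change.
func hasAnyPrefix(s string, prefixes []string) bool {
	for _, p := range prefixes {
		if strings.HasPrefix(s, p) {
			return true
		}
	}
	return false
}

// sanitizeRawSketch (hypothetical name) mimics only the suffixarray part of
// sanitizeRaw: it rejects very short or dot-free inputs, keeps everything
// from a "www." occurrence found via Lookup, and otherwise prepends a schema.
func sanitizeRawSketch(s string) string {
	if len(s) < 4 || !strings.Contains(s, ".") {
		return ""
	}
	var (
		index   = suffixarray.New([]byte(s))
		indices = index.Lookup([]byte("www."), 1)
	)
	if len(indices) > 0 {
		return "http://" + s[indices[0]:]
	}
	return "http://" + s
}

func main() {
	// Illustrative inputs, loosely modeled on the garbled reference URLs
	// quoted in the code comments above.
	for _, s := range []string{
		"example.com/a",
		"Availableat:www.example.com/report.pdf",
		"noturl",
	} {
		if hasAnyPrefix(s, []string{"http://", "https://", "ftp://"}) {
			fmt.Println(s) // already has an accepted schema
			continue
		}
		fmt.Println(sanitizeRawSketch(s))
	}
}

With these sample inputs the sketch prints "http://example.com/a", "http://www.example.com/report.pdf" and an empty line; the real sanitizeRaw would additionally run the broken-schema regex before falling back to the plain "http://" prefix.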