| author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-14 01:07:05 +0200 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-14 01:07:05 +0200 | 
| commit | 50fbd0555d3533511de8b703ea67a9dc1f4bc415 (patch) | |
| tree | ce5a17c1c4a19aa6be78e0614893892e56c38b89 | |
| parent | eccb0c95d5cb9aecff10bfddac4fee63ed888020 (diff) | |
| download | refcat-50fbd0555d3533511de8b703ea67a9dc1f4bc415.tar.gz refcat-50fbd0555d3533511de8b703ea67a9dc1f4bc415.zip | |
cleanup tweaks
| -rw-r--r-- | skate/url.go | 51 | 
1 file changed, 22 insertions, 29 deletions
```diff
diff --git a/skate/url.go b/skate/url.go
index b59103f..d8560ac 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -18,22 +18,10 @@ var (
 	okSchemas = []string{"http://", "https://", "ftp://"}
 )
 
-// hasAnyPrefixes returns true, if any of the prefixes matches string s.
-func hasAnyPrefix(s string, prefix []string) bool {
-	for _, p := range prefix {
-		if strings.HasPrefix(s, p) {
-			return true
-		}
-	}
-	return false
-}
-
-// SanitizeURL applies various cleanup rules on URLs as found in references.
-// Returns an empty string when no URL could be constructed. Still, many
-// results will not be a URL after all.  XXX: Sometimes a URL contains other
-// identifying information, like:
-// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
-// http://10.3386/w20634https://doi.org/10.3386/w20634
+// SanitizeURL applies various cleanup rules on URLs (as they are found in
+// references extracted with GROBID).  Returns an empty string when no URL
+// could be discovered. Still, many results will not be a URL even after
+// sanitization.
 func SanitizeURL(s string) string {
 	if !hasAnyPrefix(s, okSchemas) {
 		s = sanitizeRaw(s)
@@ -47,23 +35,18 @@ func SanitizeURL(s string) string {
 		indices = index.Lookup([]byte("http"), -1)
 	)
 	if len(indices) > 1 {
+		// http://ailab.ist.psu.edu/bcpred/SVMTriP:http://sysbio.unl.edu/SVMTriP/prediction.phpBcell
 		s = s[0:indices[1]] // only use the first
 		s = strings.TrimRight(s, ":")
 		s = strings.TrimRight(s, ";")
 	}
-	// http://!!!:
-	// http://!
-	// http://"
+	// http://!!!:, // http://!, // http://"
 	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)
 	// http:///en.m.wikipedia.org/ChenLong
 	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)
 	// http://10.1113/jphysiol.2002.026047
 	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
-	// .diaksestanggal27-03-2017.10.30Wib
-	// accessedon15
-	// .Accessed
-	// Acessoem:10/09/2012
-	// .Acesso:11Abr
+	// .Acesso:11Abr, accessedon15, ...
 	if patAccessedOn.MatchString(s) {
 		s = patAccessedOn.ReplaceAllString(s, `$1`)
 	}
@@ -75,21 +58,31 @@ func SanitizeURL(s string) string {
 }
 
 func sanitizeRaw(s string) string {
-	index := suffixarray.New([]byte(s))
 	if len(s) < 4 {
 		return ""
 	}
 	if !strings.Contains(s, ".") {
 		return ""
 	}
-	indices := index.Lookup([]byte("www."), 1)
+	var (
+		index   = suffixarray.New([]byte(s))
+		indices = index.Lookup([]byte("www."), 1)
+	)
 	if len(indices) > 0 {
 		return "http://" + s[indices[0]:]
 	}
 	if patBrokenSchemaPrefix.MatchString(s) {
 		return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`)
 	}
-	s = "http://" + s
-	return s
-	// Re-trievedfrom
+	return "http://" + s
+}
+
+// hasAnyPrefixes returns true, if any of the prefixes matches string s.
+func hasAnyPrefix(s string, prefix []string) bool {
+	for _, p := range prefix {
+		if strings.HasPrefix(s, p) {
+			return true
+		}
+	}
+	return false
 }
```
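For context, the "only use the first" branch above uses `index/suffixarray` from the Go standard library to locate every occurrence of "http" and cut the string at the second one, which handles references where two URLs were fused together. Below is a minimal, self-contained sketch of that trick; the helper name `truncateAtSecond` and the trailing-punctuation trim are illustrative only and not part of the skate package. Note that `Lookup` returns offsets in unspecified order, so the sketch sorts them before slicing.

```go
package main

import (
	"fmt"
	"index/suffixarray"
	"sort"
	"strings"
)

// truncateAtSecond cuts s at the second occurrence of sub and trims trailing
// ':' or ';' characters, roughly mirroring the fused-URL handling above.
func truncateAtSecond(s, sub string) string {
	index := suffixarray.New([]byte(s))
	indices := index.Lookup([]byte(sub), -1)
	if len(indices) < 2 {
		return s
	}
	// Lookup returns offsets in unspecified order, so sort before slicing.
	sort.Ints(indices)
	return strings.TrimRight(s[:indices[1]], ":;")
}

func main() {
	fused := "http://10.3386/w20634https://doi.org/10.3386/w20634"
	fmt.Println(truncateAtSecond(fused, "http")) // http://10.3386/w20634
}
```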
