aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-14 01:07:05 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-14 01:07:05 +0200
commit50fbd0555d3533511de8b703ea67a9dc1f4bc415 (patch)
treece5a17c1c4a19aa6be78e0614893892e56c38b89
parenteccb0c95d5cb9aecff10bfddac4fee63ed888020 (diff)
downloadrefcat-50fbd0555d3533511de8b703ea67a9dc1f4bc415.tar.gz
refcat-50fbd0555d3533511de8b703ea67a9dc1f4bc415.zip
cleanup tweaks
-rw-r--r--skate/url.go51
1 files changed, 22 insertions, 29 deletions
diff --git a/skate/url.go b/skate/url.go
index b59103f..d8560ac 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -18,22 +18,10 @@ var (
okSchemas = []string{"http://", "https://", "ftp://"}
)
-// hasAnyPrefixes returns true, if any of the prefixes matches string s.
-func hasAnyPrefix(s string, prefix []string) bool {
- for _, p := range prefix {
- if strings.HasPrefix(s, p) {
- return true
- }
- }
- return false
-}
-
-// SanitizeURL applies various cleanup rules on URLs as found in references.
-// Returns an empty string when no URL could be constructed. Still, many
-// results will not be a URL after all. XXX: Sometimes a URL contains other
-// identifying information, like:
-// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
-// http://10.3386/w20634https://doi.org/10.3386/w20634
+// SanitizeURL applies various cleanup rules on URLs (as they are found in
+// references extracted with GROBID). Returns an empty string when no URL
+// could be discovered. Still, many results will not be a URL even after
+// sanitization.
func SanitizeURL(s string) string {
if !hasAnyPrefix(s, okSchemas) {
s = sanitizeRaw(s)
@@ -47,23 +35,18 @@ func SanitizeURL(s string) string {
indices = index.Lookup([]byte("http"), -1)
)
if len(indices) > 1 {
+ // http://ailab.ist.psu.edu/bcpred/SVMTriP:http://sysbio.unl.edu/SVMTriP/prediction.phpBcell
s = s[0:indices[1]] // only use the first
s = strings.TrimRight(s, ":")
s = strings.TrimRight(s, ";")
}
- // http://!!!:
- // http://!
- // http://"
+ // http://!!!:, // http://!, // http://"
s = patNonWordDomain.ReplaceAllString(s, `$1$3`)
// http:///en.m.wikipedia.org/ChenLong
s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)
// http://10.1113/jphysiol.2002.026047
s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
- // .diaksestanggal27-03-2017.10.30Wib
- // accessedon15
- // .Accessed
- // Acessoem:10/09/2012
- // .Acesso:11Abr
+ // .Acesso:11Abr, accessedon15, ...
if patAccessedOn.MatchString(s) {
s = patAccessedOn.ReplaceAllString(s, `$1`)
}
@@ -75,21 +58,31 @@ func SanitizeURL(s string) string {
}
func sanitizeRaw(s string) string {
- index := suffixarray.New([]byte(s))
if len(s) < 4 {
return ""
}
if !strings.Contains(s, ".") {
return ""
}
- indices := index.Lookup([]byte("www."), 1)
+ var (
+ index = suffixarray.New([]byte(s))
+ indices = index.Lookup([]byte("www."), 1)
+ )
if len(indices) > 0 {
return "http://" + s[indices[0]:]
}
if patBrokenSchemaPrefix.MatchString(s) {
return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`)
}
- s = "http://" + s
- return s
- // Re-trievedfrom
+ return "http://" + s
+}
+
+// hasAnyPrefix returns true if any of the prefixes matches string s.
+func hasAnyPrefix(s string, prefix []string) bool {
+ for _, p := range prefix {
+ if strings.HasPrefix(s, p) {
+ return true
+ }
+ }
+ return false
}