aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-14 01:07:05 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-14 01:07:05 +0200
commit50fbd0555d3533511de8b703ea67a9dc1f4bc415 (patch)
treece5a17c1c4a19aa6be78e0614893892e56c38b89
parenteccb0c95d5cb9aecff10bfddac4fee63ed888020 (diff)
downloadrefcat-50fbd0555d3533511de8b703ea67a9dc1f4bc415.tar.gz
refcat-50fbd0555d3533511de8b703ea67a9dc1f4bc415.zip
cleanup tweaks
-rw-r--r--skate/url.go51
1 files changed, 22 insertions, 29 deletions
diff --git a/skate/url.go b/skate/url.go
index b59103f..d8560ac 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -18,22 +18,10 @@ var (
okSchemas = []string{"http://", "https://", "ftp://"}
)
-// hasAnyPrefixes returns true, if any of the prefixes matches string s.
-func hasAnyPrefix(s string, prefix []string) bool {
- for _, p := range prefix {
- if strings.HasPrefix(s, p) {
- return true
- }
- }
- return false
-}
-
-// SanitizeURL applies various cleanup rules on URLs as found in references.
-// Returns an empty string when no URL could be constructed. Still, many
-// results will not be a URL after all. XXX: Sometimes a URL contains other
-// identifying information, like:
-// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
-// http://10.3386/w20634https://doi.org/10.3386/w20634
+// SanitizeURL applies various cleanup rules on URLs (as they are found in
+// references extracted with GROBID). Returns an empty string when no URL
+// could be discovered. Still, many results will not be a URL even after
+// sanitization.
func SanitizeURL(s string) string {
if !hasAnyPrefix(s, okSchemas) {
s = sanitizeRaw(s)
@@ -47,23 +35,18 @@ func SanitizeURL(s string) string {
indices = index.Lookup([]byte("http"), -1)
)
if len(indices) > 1 {
+ // http://ailab.ist.psu.edu/bcpred/SVMTriP:http://sysbio.unl.edu/SVMTriP/prediction.phpBcell
s = s[0:indices[1]] // only use the first
s = strings.TrimRight(s, ":")
s = strings.TrimRight(s, ";")
}
- // http://!!!:
- // http://!
- // http://"
+ // http://!!!:, // http://!, // http://"
s = patNonWordDomain.ReplaceAllString(s, `$1$3`)
// http:///en.m.wikipedia.org/ChenLong
s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)
// http://10.1113/jphysiol.2002.026047
s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
- // .diaksestanggal27-03-2017.10.30Wib
- // accessedon15
- // .Accessed
- // Acessoem:10/09/2012
- // .Acesso:11Abr
+ // .Acesso:11Abr, accessedon15, ...
if patAccessedOn.MatchString(s) {
s = patAccessedOn.ReplaceAllString(s, `$1`)
}
@@ -75,21 +58,31 @@ func SanitizeURL(s string) string {
}
func sanitizeRaw(s string) string {
- index := suffixarray.New([]byte(s))
if len(s) < 4 {
return ""
}
if !strings.Contains(s, ".") {
return ""
}
- indices := index.Lookup([]byte("www."), 1)
+ var (
+ index = suffixarray.New([]byte(s))
+ indices = index.Lookup([]byte("www."), 1)
+ )
if len(indices) > 0 {
return "http://" + s[indices[0]:]
}
if patBrokenSchemaPrefix.MatchString(s) {
return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`)
}
- s = "http://" + s
- return s
- // Re-trievedfrom
+ return "http://" + s
+}
+
+// hasAnyPrefix returns true if any of the prefixes matches string s.
+func hasAnyPrefix(s string, prefix []string) bool {
+ for _, p := range prefix {
+ if strings.HasPrefix(s, p) {
+ return true
+ }
+ }
+ return false
}