aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-13 23:22:26 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-13 23:22:26 +0200
commiteccb0c95d5cb9aecff10bfddac4fee63ed888020 (patch)
treeb9630979c8bd8d4e11afb989b77a0c1aee5dae03
parentf0e41af0498a85773fbba28cb8552152365a612f (diff)
downloadrefcat-eccb0c95d5cb9aecff10bfddac4fee63ed888020.tar.gz
refcat-eccb0c95d5cb9aecff10bfddac4fee63ed888020.zip
url cleanup tweaks
-rw-r--r--skate/url.go24
-rw-r--r--skate/url_test.go20
2 files changed, 38 insertions, 6 deletions
diff --git a/skate/url.go b/skate/url.go
index f81b1d2..b59103f 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -12,8 +12,22 @@ var (
patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
+ patBrokenHttpSchema = regexp.MustCompile(`^https?[^://]*?.*`)
+ patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`)
+
+ okSchemas = []string{"http://", "https://", "ftp://"}
)
+// hasAnyPrefix returns true if string s starts with any of the given prefixes.
+func hasAnyPrefix(s string, prefix []string) bool {
+ for _, p := range prefix {
+ if strings.HasPrefix(s, p) {
+ return true
+ }
+ }
+ return false
+}
+
// SanitizeURL applies various cleanup rules on URLs as found in references.
// Returns an empty string when no URL could be constructed. Still, many
// results will not be a URL after all. XXX: Sometimes a URL contains other
@@ -21,7 +35,7 @@ var (
// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
// http://10.3386/w20634https://doi.org/10.3386/w20634
func SanitizeURL(s string) string {
- if !strings.HasPrefix(s, "http") && !strings.HasPrefix(s, "ftp") {
+ if !hasAnyPrefix(s, okSchemas) {
s = sanitizeRaw(s)
if s == "" {
return s
@@ -70,10 +84,12 @@ func sanitizeRaw(s string) string {
}
indices := index.Lookup([]byte("www."), 1)
if len(indices) > 0 {
- s = "http://" + s[indices[0]:]
- } else {
- s = "http://" + s
+ return "http://" + s[indices[0]:]
+ }
+ if patBrokenSchemaPrefix.MatchString(s) {
+ return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`)
}
+ s = "http://" + s
return s
// Re-trievedfrom
}
diff --git a/skate/url_test.go b/skate/url_test.go
index d866e7a..1a439a6 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -8,6 +8,10 @@ func TestSanitizeURL(t *testing.T) {
out string
}{
{"", ""},
+ {"a", ""},
+ {"???", ""},
+ {"???***", ""},
+ {"???***___123", ""},
{"http://abc.com", "http://abc.com"},
{"http://!!abc.com", "http://abc.com"},
{`http://"www.phaelos.com/oubre.html`, `http://www.phaelos.com/oubre.html`},
@@ -108,6 +112,18 @@ func TestSanitizeURL(t *testing.T) {
`http://www.crd.york.ac.uk/PROSPERO/display_record.asp?ID=CRD42014009228`},
{`ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`,
`ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`},
+ {`ftp-eng.cisco.com/sobgp/index.html`,
+ `http://ftp-eng.cisco.com/sobgp/index.html`},
+ {`ftp.cdc.gov/pub/Publications/mmwr/SS/SS4703.pdf`,
+ `http://ftp.cdc.gov/pub/Publications/mmwr/SS/SS4703.pdf`},
+ {`ftpftp.inria.fr`,
+ `http://ftpftp.inria.fr`},
+ {`http.bglink.com/personal/batakovic`, `http://bglink.com/personal/batakovic`},
+ {`http.kalsel.bps.go.id`, `http://kalsel.bps.go.id`},
+ {`http.www.admhmao.ru/people/frame.htm`, `http://www.admhmao.ru/people/frame.htm`},
+ {`http.worldbank.org/sq`, `http://worldbank.org/sq`},
+ {`httpwww.sun.com`, `http://www.sun.com`},
+ {`httpswww.unos.org`, `http://www.unos.org`},
}
for _, c := range cases {
out := SanitizeURL(c.in)
@@ -122,8 +138,8 @@ func BenchmarkSanitizeURL(b *testing.B) {
name string
in string
}{
- {"http", `http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`},
- {"plain", `0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`},
+ {`http`, `http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`},
+ {`plain`, `0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`},
}
for _, bm := range bms {
b.Run(bm.name, func(b *testing.B) {