diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-13 23:22:26 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-13 23:22:26 +0200 |
commit | eccb0c95d5cb9aecff10bfddac4fee63ed888020 (patch) | |
tree | b9630979c8bd8d4e11afb989b77a0c1aee5dae03 | |
parent | f0e41af0498a85773fbba28cb8552152365a612f (diff) | |
download | refcat-eccb0c95d5cb9aecff10bfddac4fee63ed888020.tar.gz refcat-eccb0c95d5cb9aecff10bfddac4fee63ed888020.zip |
url cleanup tweaks
-rw-r--r-- | skate/url.go | 24 | ||||
-rw-r--r-- | skate/url_test.go | 20 |
2 files changed, 38 insertions, 6 deletions
diff --git a/skate/url.go b/skate/url.go index f81b1d2..b59103f 100644 --- a/skate/url.go +++ b/skate/url.go @@ -12,8 +12,22 @@ var ( patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`) patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`) patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`) + patBrokenHttpSchema = regexp.MustCompile(`^https?[^://]*?.*`) + patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`) + + okSchemas = []string{"http://", "https://", "ftp://"} ) +// hasAnyPrefix reports whether s starts with any of the given prefixes. +func hasAnyPrefix(s string, prefix []string) bool { + for _, p := range prefix { + if strings.HasPrefix(s, p) { + return true + } + } + return false +} + // SanitizeURL applies various cleanup rules on URLs as found in references. // Returns an empty string when no URL could be constructed. Still, many // results will not be a URL after all. XXX: Sometimes a URL contains other @@ -21,7 +35,7 @@ var ( // http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543. 
// http://10.3386/w20634https://doi.org/10.3386/w20634 func SanitizeURL(s string) string { - if !strings.HasPrefix(s, "http") && !strings.HasPrefix(s, "ftp") { + if !hasAnyPrefix(s, okSchemas) { s = sanitizeRaw(s) if s == "" { return s @@ -70,10 +84,12 @@ func sanitizeRaw(s string) string { } indices := index.Lookup([]byte("www."), 1) if len(indices) > 0 { - s = "http://" + s[indices[0]:] - } else { - s = "http://" + s + return "http://" + s[indices[0]:] + } + if patBrokenSchemaPrefix.MatchString(s) { + return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`) } + s = "http://" + s return s // Re-trievedfrom } diff --git a/skate/url_test.go b/skate/url_test.go index d866e7a..1a439a6 100644 --- a/skate/url_test.go +++ b/skate/url_test.go @@ -8,6 +8,10 @@ func TestSanitizeURL(t *testing.T) { out string }{ {"", ""}, + {"a", ""}, + {"???", ""}, + {"???***", ""}, + {"???***___123", ""}, {"http://abc.com", "http://abc.com"}, {"http://!!abc.com", "http://abc.com"}, {`http://"www.phaelos.com/oubre.html`, `http://www.phaelos.com/oubre.html`}, @@ -108,6 +112,18 @@ func TestSanitizeURL(t *testing.T) { `http://www.crd.york.ac.uk/PROSPERO/display_record.asp?ID=CRD42014009228`}, {`ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`, `ftp://ftp.ncbi.nih.gov/genomes/Bacteria/`}, + {`ftp-eng.cisco.com/sobgp/index.html`, + `http://ftp-eng.cisco.com/sobgp/index.html`}, + {`ftp.cdc.gov/pub/Publications/mmwr/SS/SS4703.pdf`, + `http://ftp.cdc.gov/pub/Publications/mmwr/SS/SS4703.pdf`}, + {`ftpftp.inria.fr`, + `http://ftpftp.inria.fr`}, + {`http.bglink.com/personal/batakovic`, `http://bglink.com/personal/batakovic`}, + {`http.kalsel.bps.go.id`, `http://kalsel.bps.go.id`}, + {`http.www.admhmao.ru/people/frame.htm`, `http://www.admhmao.ru/people/frame.htm`}, + {`http.worldbank.org/sq`, `http://worldbank.org/sq`}, + {`httpwww.sun.com`, `http://www.sun.com`}, + {`httpswww.unos.org`, `http://www.unos.org`}, } for _, c := range cases { out := SanitizeURL(c.in) @@ -122,8 +138,8 @@ func 
BenchmarkSanitizeURL(b *testing.B) { name string in string }{ - {"http", `http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`}, - {"plain", `0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`}, + {`http`, `http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`}, + {`plain`, `0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`}, } for _, bm := range bms { b.Run(bm.name, func(b *testing.B) { |