diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-12 22:27:56 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-12 22:27:56 +0200 |
commit | 0471704cccb6a6ce950d68dbc5debb9be7db2286 (patch) | |
tree | fda7d85cbef9e30b6169ec85f92325786ce072f4 /skate | |
parent | ae3f3f3fba70dbf986f3a79a06fef8f09263360b (diff) | |
download | refcat-0471704cccb6a6ce950d68dbc5debb9be7db2286.tar.gz refcat-0471704cccb6a6ce950d68dbc5debb9be7db2286.zip |
address raw urls, w/o http prefix
Diffstat (limited to 'skate')
-rw-r--r-- | skate/url.go | 25 | ||||
-rw-r--r-- | skate/url_test.go | 8 |
2 files changed, 31 insertions, 2 deletions
diff --git a/skate/url.go b/skate/url.go
index f0edddf..66b9312 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -11,7 +11,7 @@ var (
 	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
 	patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
 	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
-	patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))(.*)$`)
+	patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
 )
 
 // SanitizeURL applies various cleanup rules on URLs as found in references.
@@ -24,6 +24,12 @@ func SanitizeURL(s string) string {
 		index = suffixarray.New([]byte(s))
 		indices = index.Lookup([]byte("http"), -1)
 	)
+	if !strings.HasPrefix(s, "http") {
+		s = sanitizeRaw(s, index)
+		if s == "" {
+			return s
+		}
+	}
 	if len(indices) > 1 {
 		s = s[0:indices[1]] // only use the first
 		s = strings.TrimRight(s, ":")
@@ -51,3 +57,20 @@ func SanitizeURL(s string) string {
 	}
 	return s
 }
+
+func sanitizeRaw(s string, index *suffixarray.Index) string {
+	if len(s) < 4 {
+		return ""
+	}
+	if !strings.Contains(s, ".") {
+		return ""
+	}
+	indices := index.Lookup([]byte("www."), 1)
+	if len(indices) > 0 {
+		s = "http://" + s[indices[0]:]
+	} else {
+		s = "http://" + s
+	}
+	return s
+	// Re-trievedfrom
+}
diff --git a/skate/url_test.go b/skate/url_test.go
index c138a0e..42ec45b 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -30,7 +30,7 @@ func TestSanitizeURL(t *testing.T) {
 		{`http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience`,
 			`http://140.120.197.173/Ecology/Download/Timing-MSChart.zip`},
 		{`141.213.232.243/bitstream/handle/2027.42/86336/apterc_1.pdf?sequence=1`,
-			`141.213.232.243/bitstream/handle/2027.42/86336/apterc_1.pdf?sequence=1`},
+			`http://141.213.232.243/bitstream/handle/2027.42/86336/apterc_1.pdf?sequence=1`},
 		{`http://141.232.10.32/pm/recover/recover_docs/perf_measures/062812_rec_pm_scs_salinity_flbay.pdfRECOVER`,
 			`http://141.232.10.32/pm/recover/recover_docs/perf_measures/062812_rec_pm_scs_salinity_flbay.pdf`},
 		{`http://2010.census.gov/news/releases/operations/cb11-cn125.html.lastaccessed4`,
@@ -98,6 +98,12 @@ func TestSanitizeURL(t *testing.T) {
 			`http://aim.bmj.com/content/31/1/23.full.pdf+html`},
 		{`http://ainfo.cnptia.embrapa.br/digital/bitstream/CNPAT-2010/8608/1/Ci-017.pdfAcessed06`,
 			`http://ainfo.cnptia.embrapa.br/digital/bitstream/CNPAT-2010/8608/1/Ci-017.pdf`},
+		{`12s`, ``},
+		{`12spoaspdop`, ``},
+		{`0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`,
+			`http://www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`},
+		{`CRAN.R-project.org/package=vegan`,
+			`http://CRAN.R-project.org/package=vegan`},
 	}
 	for _, c := range cases {
 		out := SanitizeURL(c.in)