aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-12 22:27:56 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-12 22:27:56 +0200
commit0471704cccb6a6ce950d68dbc5debb9be7db2286 (patch)
treefda7d85cbef9e30b6169ec85f92325786ce072f4 /skate
parentae3f3f3fba70dbf986f3a79a06fef8f09263360b (diff)
downloadrefcat-0471704cccb6a6ce950d68dbc5debb9be7db2286.tar.gz
refcat-0471704cccb6a6ce950d68dbc5debb9be7db2286.zip
address raw urls, w/o http prefix
Diffstat (limited to 'skate')
-rw-r--r--skate/url.go25
-rw-r--r--skate/url_test.go8
2 files changed, 31 insertions, 2 deletions
diff --git a/skate/url.go b/skate/url.go
index f0edddf..66b9312 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -11,7 +11,7 @@ var (
patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
- patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))(.*)$`)
+ patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
)
// SanitizeURL applies various cleanup rules on URLs as found in references.
@@ -24,6 +24,12 @@ func SanitizeURL(s string) string {
index = suffixarray.New([]byte(s))
indices = index.Lookup([]byte("http"), -1)
)
+ if !strings.HasPrefix(s, "http") {
+ s = sanitizeRaw(s, index)
+ if s == "" {
+ return s
+ }
+ }
if len(indices) > 1 {
s = s[0:indices[1]] // only use the first
s = strings.TrimRight(s, ":")
@@ -51,3 +57,20 @@ func SanitizeURL(s string) string {
}
return s
}
+
+// sanitizeRaw turns a raw string that lacks an "http" prefix into an
+// "http://"-prefixed URL. It returns the empty string when s is shorter than
+// four bytes or contains no dot. If "www." is found via index, everything
+// before that match is dropped, e.g. "0.0.www.example.org/a.pdf" becomes
+// "http://www.example.org/a.pdf"; otherwise "http://" is prepended to s as is.
+//
+// Offsets returned by index are used to slice s, so index must have been
+// built over the bytes of s.
+func sanitizeRaw(s string, index *suffixarray.Index) string {
+ if len(s) < 4 {
+ return ""
+ }
+ if !strings.Contains(s, ".") {
+ return ""
+ }
+ // Request at most one match. NOTE(review): suffixarray documents Lookup
+ // results as unsorted, so with several "www." occurrences this may not be
+ // the leftmost one - confirm that is acceptable.
+ indices := index.Lookup([]byte("www."), 1)
+ if len(indices) > 0 {
+ s = "http://" + s[indices[0]:]
+ } else {
+ s = "http://" + s
+ }
+ return s
+ // TODO(review): "Re-trievedfrom" - presumably a reminder that such a leading fragment is not handled yet; confirm intent.
+}
diff --git a/skate/url_test.go b/skate/url_test.go
index c138a0e..42ec45b 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -30,7 +30,7 @@ func TestSanitizeURL(t *testing.T) {
{`http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience`,
`http://140.120.197.173/Ecology/Download/Timing-MSChart.zip`},
{`141.213.232.243/bitstream/handle/2027.42/86336/apterc_1.pdf?sequence=1`,
- `141.213.232.243/bitstream/handle/2027.42/86336/apterc_1.pdf?sequence=1`},
+ `http://141.213.232.243/bitstream/handle/2027.42/86336/apterc_1.pdf?sequence=1`},
{`http://141.232.10.32/pm/recover/recover_docs/perf_measures/062812_rec_pm_scs_salinity_flbay.pdfRECOVER`,
`http://141.232.10.32/pm/recover/recover_docs/perf_measures/062812_rec_pm_scs_salinity_flbay.pdf`},
{`http://2010.census.gov/news/releases/operations/cb11-cn125.html.lastaccessed4`,
@@ -98,6 +98,12 @@ func TestSanitizeURL(t *testing.T) {
`http://aim.bmj.com/content/31/1/23.full.pdf+html`},
{`http://ainfo.cnptia.embrapa.br/digital/bitstream/CNPAT-2010/8608/1/Ci-017.pdfAcessed06`,
`http://ainfo.cnptia.embrapa.br/digital/bitstream/CNPAT-2010/8608/1/Ci-017.pdf`},
+ {`12s`, ``},
+ {`12spoaspdop`, ``},
+ {`0.0.www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`,
+ `http://www.epcglobalinc.org/standards_technology/Secure/v1.0/UHF-class1.pdf`},
+ {`CRAN.R-project.org/package=vegan`,
+ `http://CRAN.R-project.org/package=vegan`},
}
for _, c := range cases {
out := SanitizeURL(c.in)