about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- skate/url.go 16
-rw-r--r-- skate/url_test.go 25
2 files changed, 37 insertions, 4 deletions
diff --git a/skate/url.go b/skate/url.go
index 1ffff66..539613d 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -1,6 +1,7 @@
package skate
import (
+ "index/suffixarray"
"regexp"
)
@@ -8,12 +9,21 @@ var (
patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
- patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`)
- patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|doc|docx|rar))(.*)$`)
+ patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
+ patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))(.*)$`)
)
// SanitizeURL applies various cleanup rules on URLs as found in references.
+// XXX: Sometimes a URL contains other identifying information, like:
+// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
+// http://10.3386/w20634https://doi.org/10.3386/w20634
func SanitizeURL(s string) string {
+ index := suffixarray.New([]byte(s))
+ indices := index.Lookup([]byte("http"), -1)
+ if len(indices) > 1 {
+ s = s[0:indices[1]] // only use the first
+ }
+
// http://!!!:
// http://!
// http://"
@@ -25,8 +35,6 @@ func SanitizeURL(s string) string {
// http://10.1113/jphysiol.2002.026047
s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
- // http://10.3386/w20634https://doi.org/10.3386/w20634
-
// .diaksestanggal27-03-2017.10.30Wib
// accessedon15
// .Accessed
diff --git a/skate/url_test.go b/skate/url_test.go
index e1daf30..30462d7 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -67,6 +67,31 @@ func TestSanitizeURL(t *testing.T) {
`http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf`},
{`http://aalc07.psu.edu/papers/jn_typol_class3.pdf.Stanford`,
`http://aalc07.psu.edu/papers/jn_typol_class3.pdf`},
+ {`http://aboriginalhealth.flinders.edu.au/Newsletters/2010/Downloads/SHRP%20FINAL%20REPORT%20PART%20TWO%20July%202009.pdfAccessed14/12/2012`,
+ `http://aboriginalhealth.flinders.edu.au/Newsletters/2010/Downloads/SHRP%20FINAL%20REPORT%20PART%20TWO%20July%202009.pdf`},
+ {`http://about-air.ru/svojstva-vozduha/davlenie-vozduha/normalnoe-atmosfernoe-davlenie.html,доступ-свободный,датаобращения15.04.2017`,
+ `http://about-air.ru/svojstva-vozduha/davlenie-vozduha/normalnoe-atmosfernoe-davlenie.html`},
+ {`http://acl.ldc.upenn.edu/W/W98/W98-1120.pdfDateofaccess`,
+ `http://acl.ldc.upenn.edu/W/W98/W98-1120.pdf`},
+ {`http://acl.mit.edu/pa-pers/2012-uber-conference-submitted.pdf//49thIEEE`,
+ `http://acl.mit.edu/pa-pers/2012-uber-conference-submitted.pdf`},
+ {`http://acoss.org.au/policy/community_services/emergency_relief_handbook/,accessed1st`,
+ `http://acoss.org.au/policy/community_services/emergency_relief_handbook/`},
+ {`http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/.Ac-cessedon06/12/2016`,
+ `http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`},
+ {`http://acta.uta.fi/pdf/951-44-4701-6.pdfRHEINDORF`,
+ `http://acta.uta.fi/pdf/951-44-4701-6.pdf`},
+ {`http://admi.net/jo/20080423/ECE-C0771649A.html.Pageconsultéele25septembre`,
+ `http://admi.net/jo/20080423/ECE-C0771649A.html`},
+ {`http://admin.localgov.co.uk/his_localgov/view/images/uploaded/Image/childrensblackpool.PDF.Lastaccess8`,
+ `http://admin.localgov.co.uk/his_localgov/view/images/uploaded/Image/childrensblackpool.PDF`},
+ {`http://aec.ifas.ufl.edu/abrams/step/critical_litreview.pdfİndirme`,
+ `http://aec.ifas.ufl.edu/abrams/step/critical_litreview.pdf`},
+ {`http://aem.asm.org/Downloadedfrom`, `http://aem.asm.org/`},
+ {`http://aem.asm.org/content/67/6/2766.full.pdf+htmlWITTWER`,
+ `http://aem.asm.org/content/67/6/2766.full.pdf+html`},
+ {`http://agris.fao.org/agris-search/search.do?recordID=BR2013800115https://doi.org/10.5747/ca.2010.v06.n1.a044`,
+ `http://agris.fao.org/agris-search/search.do?recordID=BR2013800115`},
}
for _, c := range cases {
out := SanitizeURL(c.in)