From d28e0ceb6eb1b7e96236d9b311a6d4c9f2fa0f73 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 12 May 2021 21:02:51 +0200 Subject: url cleanup: more test cases --- skate/url.go | 16 ++++++++++++---- skate/url_test.go | 25 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/skate/url.go b/skate/url.go index 1ffff66..539613d 100644 --- a/skate/url.go +++ b/skate/url.go @@ -1,6 +1,7 @@ package skate import ( + "index/suffixarray" "regexp" ) @@ -8,12 +9,21 @@ var ( patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`) patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`) patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`) - patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|lastaccessed|acesso|accessoem|accessed|recibido|accessedat).*$`) - patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|doc|docx|rar))(.*)$`) + patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`) + patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))(.*)$`) ) // SanitizeURL applies various cleanup rules on URLs as found in references. +// XXX: Sometimes a URL contains other identifying information, like: +// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543. +// http://10.3386/w20634https://doi.org/10.3386/w20634 func SanitizeURL(s string) string { + index := suffixarray.New([]byte(s)) + indices := index.Lookup([]byte("http"), -1) + if len(indices) > 1 { + s = s[0:indices[1]] // only use the first + } + // http://!!!: // http://! // http://" @@ -25,8 +35,6 @@ func SanitizeURL(s string) string { // http://10.1113/jphysiol.2002.026047 s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`) - // http://10.3386/w20634https://doi.org/10.3386/w20634 - // .diaksestanggal27-03-2017.10.30Wib // accessedon15 // .Accessed diff --git a/skate/url_test.go b/skate/url_test.go index e1daf30..30462d7 100644 --- a/skate/url_test.go +++ b/skate/url_test.go @@ -67,6 +67,31 @@ func TestSanitizeURL(t *testing.T) { `http://195.20.232.142/img/Schwerpunktnewsletter_Oesterreich_Bibliotheken.pdf`}, {`http://aalc07.psu.edu/papers/jn_typol_class3.pdf.Stanford`, `http://aalc07.psu.edu/papers/jn_typol_class3.pdf`}, + {`http://aboriginalhealth.flinders.edu.au/Newsletters/2010/Downloads/SHRP%20FINAL%20REPORT%20PART%20TWO%20July%202009.pdfAccessed14/12/2012`, + `http://aboriginalhealth.flinders.edu.au/Newsletters/2010/Downloads/SHRP%20FINAL%20REPORT%20PART%20TWO%20July%202009.pdf`}, + {`http://about-air.ru/svojstva-vozduha/davlenie-vozduha/normalnoe-atmosfernoe-davlenie.html,доступ-свободный,датаобращения15.04.2017`, + `http://about-air.ru/svojstva-vozduha/davlenie-vozduha/normalnoe-atmosfernoe-davlenie.html`}, + {`http://acl.ldc.upenn.edu/W/W98/W98-1120.pdfDateofaccess`, + `http://acl.ldc.upenn.edu/W/W98/W98-1120.pdf`}, + {`http://acl.mit.edu/pa-pers/2012-uber-conference-submitted.pdf//49thIEEE`, + `http://acl.mit.edu/pa-pers/2012-uber-conference-submitted.pdf`}, + {`http://acoss.org.au/policy/community_services/emergency_relief_handbook/,accessed1st`, + `http://acoss.org.au/policy/community_services/emergency_relief_handbook/`}, + {`http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/.Ac-cessedon06/12/2016`, + `http://acrf.com.au/2012/world-firsthpv-vaccina-tion-plan-will-protect-young-australian-men-from-cancer/`}, + {`http://acta.uta.fi/pdf/951-44-4701-6.pdfRHEINDORF`, + `http://acta.uta.fi/pdf/951-44-4701-6.pdf`}, + {`http://admi.net/jo/20080423/ECE-C0771649A.html.Pageconsultéele25septembre`, + `http://admi.net/jo/20080423/ECE-C0771649A.html`}, + {`http://admin.localgov.co.uk/his_localgov/view/images/uploaded/Image/childrensblackpool.PDF.Lastaccess8`, + `http://admin.localgov.co.uk/his_localgov/view/images/uploaded/Image/childrensblackpool.PDF`}, + {`http://aec.ifas.ufl.edu/abrams/step/critical_litreview.pdfİndirme`, + `http://aec.ifas.ufl.edu/abrams/step/critical_litreview.pdf`}, + {`http://aem.asm.org/Downloadedfrom`, `http://aem.asm.org/`}, + {`http://aem.asm.org/content/67/6/2766.full.pdf+htmlWITTWER`, + `http://aem.asm.org/content/67/6/2766.full.pdf+html`}, + {`http://agris.fao.org/agris-search/search.do?recordID=BR2013800115https://doi.org/10.5747/ca.2010.v06.n1.a044`, + `http://agris.fao.org/agris-search/search.do?recordID=BR2013800115`}, } for _, c := range cases { out := SanitizeURL(c.in) -- cgit v1.2.3