aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-05-12 12:20:31 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-05-12 12:20:31 +0200
commit 27046b7576d92b2cbfe21450ed2ad64ba37acf66 (patch)
tree 5130c4cb2519720f05d95388aa6a19249a19e019 /skate
parent c62af895d5c9e8a6b942cb5c24da7cf5e61487a9 (diff)
downloadrefcat-27046b7576d92b2cbfe21450ed2ad64ba37acf66.tar.gz
refcat-27046b7576d92b2cbfe21450ed2ad64ba37acf66.zip
prepare url sanitization
Diffstat (limited to 'skate')
-rw-r--r-- skate/url.go 45
-rw-r--r-- skate/url_test.go 39
2 files changed, 84 insertions, 0 deletions
diff --git a/skate/url.go b/skate/url.go
new file mode 100644
index 0000000..ed36b73
--- /dev/null
+++ b/skate/url.go
@@ -0,0 +1,45 @@
+package skate
+
+import (
+ "regexp"
+)
+
var (
	// patNonWordDomain captures the scheme ($1), any run of characters
	// directly after it that are neither letters, digits nor underscore ($2)
	// and the rest of the line ($3). Unicode classes are used rather than \w,
	// which only covers ASCII in Go, so a host that starts with a non-ASCII
	// letter is not mistaken for junk.
	patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\pL\pN_]*)(.*)`)
	// patRepeatedHttpSlashes captures extra slashes following the scheme.
	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
	// patHttpDOI matches a DOI-like value ("10.", one to eight digits, a
	// slash) written directly after the scheme, where a host would be.
	patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
	// patAccessedOn matches, case-insensitively and with ungreedy
	// quantifiers, the first "accessed"-style marker and everything after it;
	// $1 is the text before the marker (and before an optional dot).
	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.]?(accessedon|consultado|diaksestanggal|diaksespadatanggal|acesso|accessoem|accessed).*$`)
	// patFileExtraSuffix captures in $1 a URL up to and including the last
	// known file extension found in its path, and in $2 whatever follows. The
	// host part ([^/]+) is skipped explicitly, so hosts such as
	// www.zippyshare.com do not count as a ".zip" file. "docx" is listed
	// before "doc" because alternation prefers the leftmost alternative.
	patFileExtraSuffix = regexp.MustCompile(`(https?://[^/]+/.*[.](?:docx|doc|html|pdf|rar|zip))(.*)$`)
)

// SanitizeURL applies various cleanup rules on URLs as found in references.
//
// The rules run in a fixed order: junk characters and surplus slashes after
// the scheme are dropped, a DOI used in place of a host is rewritten to a
// https://doi.org/ link, a trailing "accessed on ..." note is cut off and
// finally text glued to a known file extension is removed. A string that
// matches none of the rules is returned unchanged.
func SanitizeURL(s string) string {
	// http://!!!:
	// http://!
	// http://"
	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)

	// http:///en.m.wikipedia.org/ChenLong
	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)

	// http://10.1113/jphysiol.2002.026047
	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)

	// TODO: not handled yet:
	// http://10.3386/w20634https://doi.org/10.3386/w20634

	// .diaksestanggal27-03-2017.10.30Wib
	// accessedon15
	// .Accessed
	// Acessoem:10/09/2012
	// .Acesso:11Abr
	//
	// ReplaceAllString returns its input when nothing matches, so no
	// separate MatchString guard is needed.
	//
	// NOTE(review): the marker words are matched anywhere in the string, so
	// a URL that legitimately contains e.g. "accessed" or "acesso" is
	// truncated as well - confirm whether this is acceptable.
	s = patAccessedOn.ReplaceAllString(s, `$1`)

	// http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience
	return trimFileSuffix(s)
}

// trimFileSuffix cuts off text glued to a known file extension in the path
// of a URL. A remainder starting with '/', '?' or '#' is kept, as it is a
// further path segment, a query or a fragment.
func trimFileSuffix(s string) string {
	loc := patFileExtraSuffix.FindStringSubmatchIndex(s)
	if loc == nil {
		return s
	}
	// loc[3] is the end offset of the first group, i.e. the position right
	// after the file extension.
	end := loc[3]
	if end < len(s) {
		switch s[end] {
		case '/', '?', '#':
			return s
		}
	}
	return s[:end]
}
diff --git a/skate/url_test.go b/skate/url_test.go
new file mode 100644
index 0000000..5b3992e
--- /dev/null
+++ b/skate/url_test.go
@@ -0,0 +1,39 @@
+package skate
+
+import "testing"
+
+func TestSanitizeURL(t *testing.T) {
+ var cases = []struct {
+ in string
+ out string
+ }{
+ {"", ""},
+ {"http://abc.com", "http://abc.com"},
+ {"http://!!abc.com", "http://abc.com"},
+ {`http://"www.phaelos.com/oubre.html`, `http://www.phaelos.com/oubre.html`},
+ {`http://!www.rkm=journal.de/archives/13383`, `http://www.rkm=journal.de/archives/13383`},
+ {`http:///en.m.wikipedia.org/ChenLong`, `http://en.m.wikipedia.org/ChenLong`},
+ {`http://10.1111/joim.12348`, `https://doi.org/10.1111/joim.12348`},
+ {`http://10.1113/jphysiol.2002.026047`, `https://doi.org/10.1113/jphysiol.2002.026047`},
+ {`http://10.30.3.16/moodle/course/view.php?id=25`, `http://10.30.3.16/moodle/course/view.php?id=25`},
+ {`http://10.3266/RevEspEndocrinolPediatr.pre2015.Nov.330`, `https://doi.org/10.3266/RevEspEndocrinolPediatr.pre2015.Nov.330`},
+ {`http://120.107.180.177/1832/9901/099-2-07p.pdf.Accessed`, `http://120.107.180.177/1832/9901/099-2-07p.pdf`},
+ {`http://120cartas.ig.com.br/wp/maio-de-2008-um-aniversario-de-120-anos/.Acessoem:set`,
+ `http://120cartas.ig.com.br/wp/maio-de-2008-um-aniversario-de-120-anos/`},
+ {`http://122.53.86.125/NNS/8thNNS.pdf.Accessed`, `http://122.53.86.125/NNS/8thNNS.pdf`},
+ {`http://122.53.86.125/facts_figures2011.pdf.Accessedon`,
+ `http://122.53.86.125/facts_figures2011.pdf`},
+ {`http://129.3.20.41/eps/fin/papers/0507/0507016.pdf.diaksespadatanggal23Januari`,
+ `http://129.3.20.41/eps/fin/papers/0507/0507016.pdf`},
+ {`http://129.3.20.41/eps/hew/papers/0512/0512001.pdfAccessed1`,
+ `http://129.3.20.41/eps/hew/papers/0512/0512001.pdf`},
+ {`http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience`,
+ `http://140.120.197.173/Ecology/Download/Timing-MSChart.zip`},
+ }
+ for _, c := range cases {
+ out := SanitizeURL(c.in)
+ if out != c.out {
+ t.Fatalf("got %v, want %v", out, c.out)
+ }
+ }
+}