aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-09 21:40:13 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-09 21:40:13 +0200
commit dd39ba1bd24f64b8bc65c075d9f1f095304a4d29 (patch)
tree 933c580ccec50f5b9587bf54b1322547833c758d
parent d6c7e0259676da1645b7d261058acd4613fac4dd (diff)
download refcat-dd39ba1bd24f64b8bc65c075d9f1f095304a4d29.tar.gz
refcat-dd39ba1bd24f64b8bc65c075d9f1f095304a4d29.zip
wiki: tweak whitespace handling
-rw-r--r-- skate/cmd/skate-wikipedia-doi/main.go 8
1 files changed, 7 insertions, 1 deletions
diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index fe56576..829352b 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -6,6 +6,7 @@ import (
"fmt"
"log"
"os"
+ "regexp"
"runtime"
"strings"
@@ -19,6 +20,7 @@ var (
batchSize = flag.Int("b", 100000, "batch size")
bytesNewline = []byte("\n")
wsReplacer = strings.NewReplacer("\t", "", "\n", "", " ", "")
+ patDOI = regexp.MustCompile(`(10[.][0-9]{1,8}/[^ ]*[\w])`)
)
func main() {
@@ -32,8 +34,12 @@ func main() {
if ids.DOI == "" || !skate.PatDOI.MatchString(ids.DOI) {
return nil, nil
}
+ match := patDOI.FindStringSubmatch(ids.DOI)
+ if len(match) == 0 {
+ return nil, nil
+ }
var (
- doi = wsReplacer.Replace(ids.DOI)
+ doi = wsReplacer.Replace(match[0])
pageTitle = strings.TrimSpace(w.PageTitle)
s = fmt.Sprintf("%s\t%s\t%s", doi, pageTitle, string(p))
)