diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-09 21:40:13 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-09 21:40:13 +0200 |
commit | dd39ba1bd24f64b8bc65c075d9f1f095304a4d29 (patch) | |
tree | 933c580ccec50f5b9587bf54b1322547833c758d /skate | |
parent | d6c7e0259676da1645b7d261058acd4613fac4dd (diff) | |
download | refcat-dd39ba1bd24f64b8bc65c075d9f1f095304a4d29.tar.gz refcat-dd39ba1bd24f64b8bc65c075d9f1f095304a4d29.zip |
wiki: tweak whitespace handling
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cmd/skate-wikipedia-doi/main.go | 8 |
1 files changed, 7 insertions, 1 deletions
```diff
diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index fe56576..829352b 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"log"
 	"os"
+	"regexp"
 	"runtime"
 	"strings"
 
@@ -19,6 +20,7 @@ var (
 	batchSize    = flag.Int("b", 100000, "batch size")
 	bytesNewline = []byte("\n")
 	wsReplacer   = strings.NewReplacer("\t", "", "\n", "", " ", "")
+	patDOI       = regexp.MustCompile(`(10[.][0-9]{1,8}/[^ ]*[\w])`)
 )
 
 func main() {
@@ -32,8 +34,12 @@ func main() {
 		if ids.DOI == "" || !skate.PatDOI.MatchString(ids.DOI) {
 			return nil, nil
 		}
+		match := patDOI.FindStringSubmatch(ids.DOI)
+		if len(match) == 0 {
+			return nil, nil
+		}
 		var (
-			doi       = wsReplacer.Replace(ids.DOI)
+			doi       = wsReplacer.Replace(match[0])
 			pageTitle = strings.TrimSpace(w.PageTitle)
 			s         = fmt.Sprintf("%s\t%s\t%s", doi, pageTitle, string(p))
 		)
```