about | summary | refs | log | tree | commit | diff | stats
path: root/skate/cmd/skate-wikipedia-doi
diff options
context:
space:
mode:
Diffstat (limited to 'skate/cmd/skate-wikipedia-doi')
-rw-r--r-- skate/cmd/skate-wikipedia-doi/main.go 8
1 files changed, 7 insertions, 1 deletions
diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index fe56576..829352b 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -6,6 +6,7 @@ import (
"fmt"
"log"
"os"
+ "regexp"
"runtime"
"strings"
@@ -19,6 +20,7 @@ var (
batchSize = flag.Int("b", 100000, "batch size")
bytesNewline = []byte("\n")
wsReplacer = strings.NewReplacer("\t", "", "\n", "", " ", "")
+ patDOI = regexp.MustCompile(`(10[.][0-9]{1,8}/[^ ]*[\w])`)
)
func main() {
@@ -32,8 +34,12 @@ func main() {
if ids.DOI == "" || !skate.PatDOI.MatchString(ids.DOI) {
return nil, nil
}
+ match := patDOI.FindStringSubmatch(ids.DOI)
+ if len(match) == 0 {
+ return nil, nil
+ }
var (
- doi = wsReplacer.Replace(ids.DOI)
+ doi = wsReplacer.Replace(match[0])
pageTitle = strings.TrimSpace(w.PageTitle)
s = fmt.Sprintf("%s\t%s\t%s", doi, pageTitle, string(p))
)