From 6e100b23bf846f36025092a83d433e6cf4f0a9b6 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Thu, 6 May 2021 20:13:30 +0200
Subject: add a url extractor mapper

---
 skate/cmd/skate-map/main.go |  1 +
 skate/map.go                | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'skate')

diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index 227acf2..9bf2d14 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -72,6 +72,7 @@ func main() {
 		"tn": skate.MapperTitleNormalized,
 		"ty": skate.MapperTitleNysiis,
 		"ts": skate.MapperTitleSandcrawler,
+		"ru": skate.MapperURLFromRef,
 	}
 	if *logFile != "" {
 		f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644)
diff --git a/skate/map.go b/skate/map.go
index d6e37be..459558d 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -181,6 +181,23 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) {
 	return fields, nil
 }
 
+// MapperURLFromRef extracts the work ident, release ident, URL and raw doc. Previously:
+// parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident, .release_ident,
+// .biblio.url?] | @tsv'" ...
+func MapperURLFromRef(p []byte) (fields [][]byte, err error) {
+	var ref Ref
+	if err := json.Unmarshal(p, &ref); err != nil {
+		return nil, err
+	}
+	fields = [][]byte{
+		[]byte(ref.WorkIdent),
+		[]byte(ref.ReleaseIdent),
+		[]byte(ref.Biblio.Url),
+		p,
+	}
+	return fields, nil
+}
+
 // MapperPartial works on partial documents.
 func MapperPartial(p []byte) (fields [][]byte, err error) {
 	// XXX: slugify authors, how to compare two author strings? How do these
-- 
cgit v1.2.3