diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-06 20:13:30 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-06 20:13:30 +0200 |
commit | 6e100b23bf846f36025092a83d433e6cf4f0a9b6 (patch) | |
tree | 015948b142aff27b724285b5dabc5e724f178d4d /skate | |
parent | 6c327acbf5799dde9c153843ac3ba1471e88317c (diff) | |
download | refcat-6e100b23bf846f36025092a83d433e6cf4f0a9b6.tar.gz refcat-6e100b23bf846f36025092a83d433e6cf4f0a9b6.zip |
add a url extractor mapper
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cmd/skate-map/main.go | 1 | ||||
-rw-r--r-- | skate/map.go | 17 |
2 files changed, 18 insertions, 0 deletions
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go index 227acf2..9bf2d14 100644 --- a/skate/cmd/skate-map/main.go +++ b/skate/cmd/skate-map/main.go @@ -72,6 +72,7 @@ func main() { "tn": skate.MapperTitleNormalized, "ty": skate.MapperTitleNysiis, "ts": skate.MapperTitleSandcrawler, + "ru": skate.MapperURLFromRef, } if *logFile != "" { f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644) diff --git a/skate/map.go b/skate/map.go index d6e37be..459558d 100644 --- a/skate/map.go +++ b/skate/map.go @@ -181,6 +181,23 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) { return fields, nil } +// MapperURLFromRef extracts the work, release ident, url and doc. Previously: +// parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident, .release_ident, +// .biblio.url?] | @tsv'" ... +func MapperURLFromRef(p []byte) (fields [][]byte, err error) { + var ref Ref + if err := json.Unmarshal(p, &ref); err != nil { + return nil, err + } + fields := [][]byte{ + []byte(ref.WorkIdent), + []byte(ref.ReleaseIdent), + []byte(ref.Biblio.Url), + p, + } + return fields, nil +} + // MapperPartial works on partial documents. func MapperPartial(p []byte) (fields [][]byte, err error) { // XXX: slugify authors, how to compare two author strings? How do these |