diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-08-05 13:17:01 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-08-05 13:17:01 +0200 |
commit | df126d7252bea6d3877cb67211a34f14788aa358 (patch) | |
tree | a30e9e0ffcda60f983c6ba02dafc1bcd55eb4459 | |
parent | 769ee237046a8553583e0414e1f56877b7f1a847 (diff) | |
download | refcat-df126d7252bea6d3877cb67211a34f14788aa358.tar.gz refcat-df126d7252bea6d3877cb67211a34f14788aa358.zip |
tasks: use a mapper
-rw-r--r-- | python/refcat/tasks.py | 3 | ||||
-rw-r--r-- | skate/cmd/skate-map/main.go | 1 | ||||
-rw-r--r-- | skate/map.go | 19 | ||||
-rw-r--r-- | skate/schema.go | 2 |
4 files changed, 23 insertions, 2 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 573ec74..66175c8 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -448,8 +448,7 @@ class BrefDOITable(Refcat): def run(self): output = shellout(""" zstdcat -T0 {input} | - parallel --block 10M -j 20 --pipe - "jq -rc '[.source_release_ident, .target_release_ident, .source_doi, .target_doi] | @tsv'" | + skate-map -m bidt | zstd -c -T0 > {output} """, input=self.input().path) diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go index 6c61af0..57a1498 100644 --- a/skate/cmd/skate-map/main.go +++ b/skate/cmd/skate-map/main.go @@ -83,6 +83,7 @@ func main() { "cdxu": skate.MapperCdxSummary, "bref": skate.MapperBrefWork, "rewo": skate.MapperReleaseWork, + "bidt": skate.MapperBrefIdentifierTable, } if *logFile != "" { f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644) diff --git a/skate/map.go b/skate/map.go index ca98186..ad62328 100644 --- a/skate/map.go +++ b/skate/map.go @@ -415,6 +415,25 @@ func MapperReleaseWork(p []byte) (fields [][]byte, err error) { return [][]byte{[]byte(release.WorkID), p}, nil } +// MapperBrefIdentifierTable generates an id table from biblioref. +func MapperBrefIdentifierTable(p []byte) (field [][]byte, err error) { + var bref struct { + SourceReleaseIdent string `json:"source_release_ident,omitempty"` + TargetReleaseIdent string `json:"target_release_ident,omitempty"` + SourceDOI string `json:"source_doi,omitempty"` + TargetDOI string `json:"target_doi,omitempty"` + } + if err := json.Unmarshal(p, &bref); err != nil { + return nil, err + } + return [][]byte{ + []byte(bref.SourceReleaseIdent), + []byte(bref.TargetReleaseIdent), + []byte(bref.SourceDOI), + []byte(bref.TargetDOI), + }, nil +} + // sandcrawlerSlugify normalizes a string. func sandcrawlerSlugify(s string) string { slug := strings.ToLower(strings.TrimSpace(s)) diff --git a/skate/schema.go b/skate/schema.go index 2cea15a..078556c 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -438,6 +438,8 @@ type BiblioRef struct { MatchReason string `json:"match_reason,omitempty"` TargetUnstructured string `json:"target_unstructured,omitempty"` TargetCSL *CSL `json:"target_csl,omitempty"` + SourceDOI string `json:"source_doi,omitempty"` + TargetDOI string `json:"target_doi,omitempty"` } // CSL is a subset of citation style language schema. |