diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-05 16:12:55 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-05 16:12:55 +0200 |
commit | 048641f7672e2ff04a80d4486d0c21bd61369f0f (patch) | |
tree | a32822f05d484ebb979a2ad39a7483781e118288 /skate | |
parent | 634b7b7d910ddb20c5af0722de41ef5ccded7358 (diff) | |
download | refcat-048641f7672e2ff04a80d4486d0c21bd61369f0f.tar.gz refcat-048641f7672e2ff04a80d4486d0c21bd61369f0f.zip |
skate-conv tweaks
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cmd/skate-conv/main.go | 53 | ||||
-rw-r--r-- | skate/cmd/skate-from-unstructured/main.go | 2 | ||||
-rw-r--r-- | skate/schema.go | 5 | ||||
-rw-r--r-- | skate/verify.go | 6 |
4 files changed, 36 insertions, 30 deletions
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go index 647472e..34e79a3 100644 --- a/skate/cmd/skate-conv/main.go +++ b/skate/cmd/skate-conv/main.go @@ -21,10 +21,9 @@ import ( var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 100000, "batch size") - fromFormat = flag.String("f", "ref", "import schema") + fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol") - bytesNewline = []byte("\n") - f func([]byte) ([]byte, error) + f func([]byte) ([]byte, error) // our converter function ) func main() { @@ -36,6 +35,8 @@ func main() { f = rgSitemapToRelease case "ol": f = openLibraryToRelease + default: + log.Fatal("unsupported input schema: %v", *fromFormat) } pp := parallel.NewProcessor(os.Stdin, os.Stdout, f) pp.NumWorkers = *numWorkers @@ -45,55 +46,61 @@ func main() { } } -// refToRelease converts a ref document to a release. +// refToRelease converts a ref document to a release. The standard conversion +// plus some extra fields. func refToRelease(p []byte) ([]byte, error) { - var ref skate.Ref - if err := json.Unmarshal(p, &ref); err != nil { + var ( + ref skate.Ref + release *skate.Release + err error + ) + if err = json.Unmarshal(p, &ref); err != nil { return nil, err } - release, err := skate.RefToRelease(&ref) - if err != nil { + if release, err = skate.RefToRelease(&ref); err != nil { return nil, err } release.Extra.Skate.Status = "ref" // means: converted from ref release.Extra.Skate.Ref.Index = ref.Index release.Extra.Skate.Ref.Key = ref.Key - b, err := json.Marshal(release) - b = append(b, bytesNewline...) - return b, err + return skate.JsonMarshalNewline(release) } +// rgSitemapToRelease converts a simple sitemap to a release entity, e.g. from +// https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst. func rgSitemapToRelease(p []byte) ([]byte, error) { var ( s skate.Sitemap release skate.Release + err error ) - if err := json.Unmarshal(p, &s); err != nil { + if err = json.Unmarshal(p, &s); err != nil { return nil, err } release.Title = s.Title if len(s.URL) > 41 { - // XXX: A pseudo ident, maybe irritating. + // A pseudo ident, maybe irritating; we want the "321885388". + // https://www.researchgate.net/publication/321885388_We_came_here_on_dif release.Ident = strings.Split(s.URL[41:], "_")[0] } release.Extra.Skate.Status = "rg" release.Extra.Skate.ResearchGate.URL = s.URL - b, err := json.Marshal(release) - b = append(b, bytesNewline...) - return b, err + return skate.JsonMarshalNewline(release) } +// openLibraryToRelease converts an Open Library work item to a release. func openLibraryToRelease(p []byte) ([]byte, error) { - var w skate.OpenLibraryWork - if err := json.Unmarshal(p, &w); err != nil { + var ( + w skate.OpenLibraryWork + release *skate.Release + err error + ) + if err = json.Unmarshal(p, &w); err != nil { return nil, err } - release, err := skate.OpenLibraryToRelease(&w) - if err != nil { + if release, err = skate.OpenLibraryToRelease(&w); err != nil { return nil, err } release.Extra.Skate.Status = "ol" - b, err := json.Marshal(release) - b = append(b, bytesNewline...) - return b, err + return skate.JsonMarshalNewline(release) } diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go index 179057d..2ccdd7d 100644 --- a/skate/cmd/skate-from-unstructured/main.go +++ b/skate/cmd/skate-from-unstructured/main.go @@ -28,7 +28,7 @@ func main() { if err := skate.ParseUnstructured(&ref); err != nil { return nil, err } - return skate.JsonMarshalLine(&ref) + return skate.JsonMarshalNewline(&ref) }) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize diff --git a/skate/schema.go b/skate/schema.go index 9f3af45..1878205 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -27,8 +27,7 @@ var ( } ) -// RefToRelease converts a ref to a release. Set a extra.skate.status flag to -// be able to distinguish converted entities later. +// RefToRelease converts a ref to a release. func RefToRelease(ref *Ref) (*Release, error) { var ( release Release @@ -201,7 +200,7 @@ type Release struct { Relations []DataCiteRelation `json:"relations,omitempty"` } `json:"datacite,omitempty"` Skate struct { - // Mark as converted (e.g. by setting status to "ref") + // Mark as converted from "ref", "rg" or other schemas. Status string `json:"status,omitempty"` // Carry the ref index and key around. Ref struct { diff --git a/skate/verify.go b/skate/verify.go index e6ab03e..d1f98f0 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -124,8 +124,8 @@ var ( PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`) ) -// JsonMarshalLine marshals a value as JSON and adds a newline. -func JsonMarshalLine(v interface{}) ([]byte, error) { +// JsonMarshalNewline marshals a value as JSON and adds a newline. +func JsonMarshalNewline(v interface{}) ([]byte, error) { b, err := json.Marshal(v) if err != nil { return nil, err @@ -234,7 +234,7 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) { continue // Assume we already have the DOI matches. } br = generateBiblioRef(re, pivot, result, "fuzzy") - return JsonMarshalLine(br) + return JsonMarshalNewline(br) default: continue } |