diff options
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cluster.go | 9 | ||||
-rw-r--r-- | skate/cmd/skate-derive-key/main.go | 30 | ||||
-rw-r--r-- | skate/fixtures/biblioref_v2_10_docs.json | 3 |
3 files changed, 29 insertions, 13 deletions
diff --git a/skate/cluster.go b/skate/cluster.go index 954d971..bec8154 100644 --- a/skate/cluster.go +++ b/skate/cluster.go @@ -110,15 +110,6 @@ func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) { return ident, sandcrawlerSlugify(key), nil } -// KeySourceIdent extracts the source ident. -func KeySourceIdent(p []byte) (ident string, key string, err error) { - var doc IdentTitleDoc - if err = json.Unmarshal(p, &doc); err != nil { - return doc.Ident, doc.Ident, err - } - return doc.Ident, doc.Ident, nil -} - // sandcrawlerSlugify normalizes a string. func sandcrawlerSlugify(s string) string { slug := strings.ToLower(strings.TrimSpace(s)) diff --git a/skate/cmd/skate-derive-key/main.go b/skate/cmd/skate-derive-key/main.go index 653e258..b5b13f2 100644 --- a/skate/cmd/skate-derive-key/main.go +++ b/skate/cmd/skate-derive-key/main.go @@ -30,10 +30,12 @@ import ( "git.archive.org/martin/cgraph/skate" "git.archive.org/martin/cgraph/skate/parallel" + jsoniter "github.com/json-iterator/go" ) var ( - keyFuncName = flag.String("f", "tsand", "key function name, other: title, tnorm, tnysi, tsand, ident") + keyFuncName = flag.String("f", "tsand", "key function name, other: title, tnorm, tnysi, tsand") + fixedField = flag.String("F", "", "extract value from a fixed top level field, e.g. source_release_ident, ...") numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 50000, "batch size") verbose = flag.Bool("verbose", false, "show progress") @@ -47,16 +49,36 @@ var ( "tnorm": skate.KeyTitleNormalized, "tnysi": skate.KeyTitleNysiis, "tsand": skate.KeyTitleSandcrawler, - "ident": skate.KeySourceIdent, } keyFunc skate.IdentifierKeyFunc ok bool + json = jsoniter.ConfigCompatibleWithStandardLibrary ) func main() { flag.Parse() - if keyFunc, ok = keyOpts[*keyFuncName]; !ok { - log.Fatal("invalid key func") + if *fixedField != "" { + // We want this, because from biblioref we wanted source_release_ident, + // from refs release_ident, etc. + keyFunc = func(p []byte) (id string, key string, err error) { + var doc map[string]interface{} + if err = json.Unmarshal(p, &doc); err != nil { + return + } + v, ok := doc[*fixedField] + if !ok { + return "", "", nil + } + s, ok := v.(string) + if !ok { + return "", "", nil + } + return "", s, nil + } + } else { + if keyFunc, ok = keyOpts[*keyFuncName]; !ok { + log.Fatal("invalid key func") + } } if *logFile != "" { f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644) diff --git a/skate/fixtures/biblioref_v2_10_docs.json b/skate/fixtures/biblioref_v2_10_docs.json new file mode 100644 index 0000000..bf1e9b0 --- /dev/null +++ b/skate/fixtures/biblioref_v2_10_docs.json @@ -0,0 +1,3 @@ +{"_id":"djulycilxfegzmmf3oud2ctt3e_34","update_ts":1616550415,"source_release_ident":"djulycilxfegzmmf3oud2ctt3e","source_work_ident":"5r6wyyk2szfbhatxdhijqrjife","source_year":"2018","ref_index":34,"ref_key":"CIT0034","target_release_ident":"tcvffh4gfre5nadizpf4f2pcgm","target_work_ident":"hnl4df4vtfcrjbwdv652bhz7ky","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"qfln4z2rjbh25bjkdtgqfgbr5y_7","update_ts":1616550415,"source_release_ident":"qfln4z2rjbh25bjkdtgqfgbr5y","source_work_ident":"6tvnvlstyjbwlncoyo3y7eobfu","source_year":"2015","ref_index":7,"ref_key":"b6","target_release_ident":"fr2xflnbmfff3gx2vrxywllmp4","target_work_ident":"uyzsagyjd5gytoltooagyqy66u","match_provenance":"join","match_status":"exact","match_reason":"doi"} +{"_id":"cql2z4z2mzdtne26lbdclaf2mi_18","update_ts":1616550415,"source_release_ident":"cql2z4z2mzdtne26lbdclaf2mi","source_work_ident":"e3r7czvqhzbjzny2h2y5r4j76y","source_year":"2009","ref_index":18,"ref_key":"10.1111/j.1538-7836.2009.03685.x-BIB18|cit18","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"} |