aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
Diffstat (limited to 'skate')
-rw-r--r--skate/cluster.go9
-rw-r--r--skate/cmd/skate-derive-key/main.go30
-rw-r--r--skate/fixtures/biblioref_v2_10_docs.json3
3 files changed, 29 insertions, 13 deletions
diff --git a/skate/cluster.go b/skate/cluster.go
index 954d971..bec8154 100644
--- a/skate/cluster.go
+++ b/skate/cluster.go
@@ -110,15 +110,6 @@ func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) {
return ident, sandcrawlerSlugify(key), nil
}
-// KeySourceIdent extracts the source ident.
-func KeySourceIdent(p []byte) (ident string, key string, err error) {
- var doc IdentTitleDoc
- if err = json.Unmarshal(p, &doc); err != nil {
- return doc.Ident, doc.Ident, err
- }
- return doc.Ident, doc.Ident, nil
-}
-
// sandcrawlerSlugify normalizes a string.
func sandcrawlerSlugify(s string) string {
slug := strings.ToLower(strings.TrimSpace(s))
diff --git a/skate/cmd/skate-derive-key/main.go b/skate/cmd/skate-derive-key/main.go
index 653e258..b5b13f2 100644
--- a/skate/cmd/skate-derive-key/main.go
+++ b/skate/cmd/skate-derive-key/main.go
@@ -30,10 +30,12 @@ import (
"git.archive.org/martin/cgraph/skate"
"git.archive.org/martin/cgraph/skate/parallel"
+ jsoniter "github.com/json-iterator/go"
)
var (
- keyFuncName = flag.String("f", "tsand", "key function name, other: title, tnorm, tnysi, tsand, ident")
+ keyFuncName = flag.String("f", "tsand", "key function name, other: title, tnorm, tnysi, tsand")
+ fixedField = flag.String("F", "", "extract value from a fixed top level field, e.g. source_release_ident, ...")
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 50000, "batch size")
verbose = flag.Bool("verbose", false, "show progress")
@@ -47,16 +49,36 @@ var (
"tnorm": skate.KeyTitleNormalized,
"tnysi": skate.KeyTitleNysiis,
"tsand": skate.KeyTitleSandcrawler,
- "ident": skate.KeySourceIdent,
}
keyFunc skate.IdentifierKeyFunc
ok bool
+ json = jsoniter.ConfigCompatibleWithStandardLibrary
)
func main() {
flag.Parse()
- if keyFunc, ok = keyOpts[*keyFuncName]; !ok {
- log.Fatal("invalid key func")
+ if *fixedField != "" {
+ // The key field differs by input type: for biblioref documents we want
+ // source_release_ident, for refs release_ident, etc.
+ keyFunc = func(p []byte) (id string, key string, err error) {
+ var doc map[string]interface{}
+ if err = json.Unmarshal(p, &doc); err != nil {
+ return
+ }
+ v, ok := doc[*fixedField]
+ if !ok {
+ return "", "", nil
+ }
+ s, ok := v.(string)
+ if !ok {
+ return "", "", nil
+ }
+ return "", s, nil
+ }
+ } else {
+ if keyFunc, ok = keyOpts[*keyFuncName]; !ok {
+ log.Fatal("invalid key func")
+ }
}
if *logFile != "" {
f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644)
diff --git a/skate/fixtures/biblioref_v2_10_docs.json b/skate/fixtures/biblioref_v2_10_docs.json
new file mode 100644
index 0000000..bf1e9b0
--- /dev/null
+++ b/skate/fixtures/biblioref_v2_10_docs.json
@@ -0,0 +1,3 @@
+{"_id":"djulycilxfegzmmf3oud2ctt3e_34","update_ts":1616550415,"source_release_ident":"djulycilxfegzmmf3oud2ctt3e","source_work_ident":"5r6wyyk2szfbhatxdhijqrjife","source_year":"2018","ref_index":34,"ref_key":"CIT0034","target_release_ident":"tcvffh4gfre5nadizpf4f2pcgm","target_work_ident":"hnl4df4vtfcrjbwdv652bhz7ky","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"qfln4z2rjbh25bjkdtgqfgbr5y_7","update_ts":1616550415,"source_release_ident":"qfln4z2rjbh25bjkdtgqfgbr5y","source_work_ident":"6tvnvlstyjbwlncoyo3y7eobfu","source_year":"2015","ref_index":7,"ref_key":"b6","target_release_ident":"fr2xflnbmfff3gx2vrxywllmp4","target_work_ident":"uyzsagyjd5gytoltooagyqy66u","match_provenance":"join","match_status":"exact","match_reason":"doi"}
+{"_id":"cql2z4z2mzdtne26lbdclaf2mi_18","update_ts":1616550415,"source_release_ident":"cql2z4z2mzdtne26lbdclaf2mi","source_work_ident":"e3r7czvqhzbjzny2h2y5r4j76y","source_year":"2009","ref_index":18,"ref_key":"10.1111/j.1538-7836.2009.03685.x-BIB18|cit18","target_release_ident":"eedkg7qjufcqnamezd67vlcbru","target_work_ident":"jyi6malei5bmfhtepalmgu2rzq","match_provenance":"join","match_status":"exact","match_reason":"doi"}