aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
Diffstat (limited to 'skate')
-rw-r--r--skate/cmd/skate-resolve-journal-name/main.go124
-rw-r--r--skate/schema.go1
-rw-r--r--skate/schema_test.go1
3 files changed, 126 insertions, 0 deletions
diff --git a/skate/cmd/skate-resolve-journal-name/main.go b/skate/cmd/skate-resolve-journal-name/main.go
new file mode 100644
index 0000000..7f4b53d
--- /dev/null
+++ b/skate/cmd/skate-resolve-journal-name/main.go
@@ -0,0 +1,124 @@
+// skate-resolve-journal-name works on TSV input where one column (-f) holds a
+// release schema document. It inspects the container name, tries to resolve
+// it as a journal name abbreviation and, on an unambiguous match, augments
+// the release document's extra data with the resolved name.
+//
+// The abbreviation file format currently is a JSONL format, each document
+// containing a "name" and "abbrev" (optional).
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "runtime"
+ "strings"
+
+ "git.archive.org/martin/cgraph/skate"
+ "git.archive.org/martin/cgraph/skate/parallel"
+ "git.archive.org/martin/cgraph/skate/set"
+)
+
+var ( // command line flags; they hold parsed values only after flag.Parse has run in main
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") // copied to the parallel processor's NumWorkers
+ batchSize = flag.Int("b", 100000, "batch size") // copied to the parallel processor's BatchSize
+ column = flag.Int("f", 2, "column to find the release schema document in (1-indexed)") // turned into a 0-based index when accessing the split fields
+ sep = flag.String("d", "\t", "delimiter to use") // used both to split each input line and to join the output fields
+ bestEffort = flag.Bool("B", false, "only log errors, do not halt")
+ abbrevFile = flag.String("A", "", "path to abbreviate file") // JSONL file, one NameAbbrev document per line
+)
+
+// main parses the flags, loads the abbreviation mapping given via -A and
+// streams stdin to stdout through a parallel processor. For every line the
+// release document found in column -f is decoded; if its lowercased container
+// name matches exactly one known abbreviation, and that abbreviation maps to
+// exactly one full name, the (lowercased) full name is stored in
+// extra.skate.resolved_container_name and the column is re-encoded. All other
+// lines are emitted unchanged.
+//
+// Per-line errors (too few columns, invalid JSON) abort the run, unless -B is
+// set, in which case they are logged and the line is passed through as is.
+func main() {
+	flag.Parse()
+	// A column below 1 would index the fields slice out of range and panic
+	// inside a worker; reject it up front.
+	if *column < 1 {
+		log.Fatalf("invalid column %d, columns are 1-indexed", *column)
+	}
+	if *abbrevFile == "" {
+		log.Fatal("path to abbreviation file required (-A)")
+	}
+	ms, m, err := createAbbreviationMatchSet(*abbrevFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	log.Printf("found %d abbreviation mappings", len(m))
+	var (
+		delim = []byte(*sep)
+		idx   = *column - 1
+	)
+	// resolve handles a single input line.
+	resolve := func(p []byte) ([]byte, error) {
+		// Set a possible line terminator aside, so it is not lost when the
+		// release document sits in the last column and gets replaced below.
+		line := bytes.TrimRight(p, "\r\n")
+		eol := p[len(line):]
+		fields := bytes.Split(line, delim)
+		if idx >= len(fields) {
+			return nil, fmt.Errorf("column mismatch: want column %d, line has %d", *column, len(fields))
+		}
+		var release *skate.Release
+		if err := json.Unmarshal(fields[idx], &release); err != nil {
+			return nil, err
+		}
+		if release == nil {
+			// A JSON "null" decodes without error but leaves the pointer
+			// nil; there is nothing to resolve.
+			return p, nil
+		}
+		name := strings.ToLower(release.ContainerName)
+		results := ms.Lookup(name, -1)
+		if len(results) != 1 {
+			// Too many or too few matches? We return the document unchanged.
+			return p, nil
+		}
+		resolved := m[results[0]]
+		if len(resolved) != 1 {
+			// Abbreviation mapping to different full names? Skip.
+			return p, nil
+		}
+		release.Extra.Skate.ResolvedContainerName = resolved[0]
+		b, err := json.Marshal(release)
+		if err != nil {
+			return nil, err
+		}
+		fields[idx] = b
+		return append(bytes.Join(fields, delim), eol...), nil
+	}
+	pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
+		out, err := resolve(p)
+		if err != nil && *bestEffort {
+			// Best effort (-B): report the problem, keep the line as is.
+			log.Printf("skipping line: %v", err)
+			return p, nil
+		}
+		return out, err
+	})
+	pp.NumWorkers = *numWorkers
+	pp.BatchSize = *batchSize
+	if err := pp.Run(); err != nil {
+		log.Fatal(err)
+	}
+}
+
+type NameAbbrev struct { // NameAbbrev is one document (line) of the JSONL abbreviation file.
+ Name string `json:"name"` // journal name; lowercased before it is stored in the mapping
+ Abbrev string `json:"abbrev"` // abbreviation of Name; lowercased and used as the mapping key
+}
+
+// createAbbreviationMatchSet reads the JSONL abbreviation file at filename,
+// one NameAbbrev document per line, and returns a match set built from all
+// lowercased abbreviations together with a mapping from each lowercased
+// abbreviation to the distinct lowercased journal names seen for it.
+//
+// Blank lines are ignored, as are documents without an abbreviation, since
+// they give nothing to match on. A final line without a trailing newline is
+// processed like any other. An error is returned if the file cannot be
+// opened or read, or if a line is not valid JSON; JSON errors are prefixed
+// with the file name and line number.
+func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]string, error) {
+	f, err := os.Open(filename)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer f.Close()
+	var (
+		sm     = make(map[string]set.Set)
+		br     = bufio.NewReader(f)
+		lineno int
+	)
+	for {
+		// ReadBytes may return data together with io.EOF (last line lacking
+		// a newline), so the data is handled before the EOF check.
+		b, rerr := br.ReadBytes('\n')
+		if rerr != nil && rerr != io.EOF {
+			return nil, nil, rerr
+		}
+		lineno++
+		if line := bytes.TrimSpace(b); len(line) > 0 {
+			// Decode into a fresh value per line; reusing one would let a
+			// document without "abbrev" inherit the previous abbreviation.
+			var na NameAbbrev
+			if err := json.Unmarshal(line, &na); err != nil {
+				return nil, nil, fmt.Errorf("%s:%d: %w", filename, lineno, err)
+			}
+			if key := strings.ToLower(na.Abbrev); key != "" {
+				if _, ok := sm[key]; !ok {
+					sm[key] = set.New()
+				}
+				sm[key].Add(strings.ToLower(na.Name))
+			}
+		}
+		if rerr == io.EOF {
+			break
+		}
+	}
+	var (
+		abbreviations = make([]string, 0, len(sm))
+		m             = make(map[string][]string, len(sm))
+	)
+	for k, v := range sm {
+		abbreviations = append(abbreviations, k)
+		m[k] = v.Slice()
+	}
+	return skate.NewMatchSet(abbreviations), m, nil
+}
diff --git a/skate/schema.go b/skate/schema.go
index 4845720..c2107e4 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -232,6 +232,7 @@ type Release struct {
ResearchGate struct {
URL string `json:"url,omitempty"`
} `json:"rg,omitempty"`
+ ResolvedContainerName string `json:"resolved_container_name"`
} `json:"skate,omitempty"`
OpenLibrary struct {
HasFulltext bool `json:"has_fulltext,omitempty"`
diff --git a/skate/schema_test.go b/skate/schema_test.go
index 9fe808b..57c4700 100644
--- a/skate/schema_test.go
+++ b/skate/schema_test.go
@@ -107,6 +107,7 @@ func TestOpenLibraryToRelease(t *testing.T) {
ResearchGate struct {
URL string `json:"url,omitempty"`
} `json:"rg,omitempty"`
+ ResolvedContainerName string `json:"resolved_container_name"`
} `json:"skate,omitempty"`
OpenLibrary struct {
HasFulltext bool `json:"has_fulltext,omitempty"`