diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-06-01 14:40:47 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-06-01 14:40:47 +0200 |
commit | 9d8c80f2aef49b6c33e5070c800b7919670075f4 (patch) | |
tree | 6c63f04927d70ce955d862bb8ef3cdadf47a9076 /skate | |
parent | 0f4c1fa49b8c4dbd37349daf09c2d33a8390fb83 (diff) | |
download | refcat-9d8c80f2aef49b6c33e5070c800b7919670075f4.tar.gz refcat-9d8c80f2aef49b6c33e5070c800b7919670075f4.zip |
add container name abbreviation resolver tool
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cmd/skate-resolve-journal-name/main.go | 124 | ||||
-rw-r--r-- | skate/schema.go | 1 | ||||
-rw-r--r-- | skate/schema_test.go | 1 |
3 files changed, 126 insertions, 0 deletions
diff --git a/skate/cmd/skate-resolve-journal-name/main.go b/skate/cmd/skate-resolve-journal-name/main.go
new file mode 100644
index 0000000..7f4b53d
--- /dev/null
+++ b/skate/cmd/skate-resolve-journal-name/main.go
@@ -0,0 +1,170 @@
+// skate-resolve-journal-name works on TSV where one column (-f) is a release
+// schema. It will inspect the container name and will resolve journal name
+// abbreviations and will augment the release document with data in extra
+// containing the resolved name.
+//
+// The abbreviation file format currently is a JSONL format, each document
+// containing a "name" and "abbrev" (optional).
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"runtime"
+	"strings"
+
+	"git.archive.org/martin/cgraph/skate"
+	"git.archive.org/martin/cgraph/skate/parallel"
+	"git.archive.org/martin/cgraph/skate/set"
+)
+
+var (
+	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+	batchSize  = flag.Int("b", 100000, "batch size")
+	column     = flag.Int("f", 2, "column to find the release schema document in (1-indexed)")
+	sep        = flag.String("d", "\t", "delimiter to use")
+	bestEffort = flag.Bool("B", false, "only log errors, do not halt")
+	abbrevFile = flag.String("A", "", "path to abbreviate file")
+)
+
+// main loads the abbreviation mappings, then rewrites the TSV read from stdin
+// line by line and writes the result to stdout.
+func main() {
+	flag.Parse()
+	// Columns are 1-indexed; a smaller value would index before the first
+	// field and panic inside a worker.
+	if *column < 1 {
+		log.Fatalf("invalid column %d, columns are 1-indexed", *column)
+	}
+	ms, m, err := createAbbreviationMatchSet(*abbrevFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	log.Printf("found %d abbreviation mappings", len(m))
+	delim := []byte(*sep) // converted once, not once per line
+	pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
+		out, err := resolveLine(p, delim, *column, ms, m)
+		if err != nil && *bestEffort {
+			// Honor -B: report the problem and pass the line through
+			// unchanged instead of aborting the whole run.
+			log.Printf("skipping line: %v", err)
+			return p, nil
+		}
+		return out, err
+	})
+	pp.NumWorkers = *numWorkers
+	pp.BatchSize = *batchSize
+	if err := pp.Run(); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// resolveLine inspects the release document in the 1-indexed column col of the
+// line p, whose fields are separated by sep. If ms.Lookup yields exactly one
+// result for the lowercased container name, and m maps that result to exactly
+// one full name, the name is stored in the document and the rewritten line is
+// returned. In all other non-error cases p is returned unchanged.
+func resolveLine(p, sep []byte, col int, ms *skate.MatchSet, m map[string][]string) ([]byte, error) {
+	// Set a trailing newline aside, so it is not lost when the rewritten
+	// document sits in the last column.
+	line, eol := p, []byte(nil)
+	if bytes.HasSuffix(p, []byte("\n")) {
+		line, eol = p[:len(p)-1], p[len(p)-1:]
+	}
+	fields := bytes.Split(line, sep)
+	if col > len(fields) {
+		return nil, fmt.Errorf("column mismatch: want column %d, got %d fields", col, len(fields))
+	}
+	// Decode into a value: with a nil pointer target, a JSON null would
+	// leave the pointer nil and the field access below would panic.
+	var release skate.Release
+	if err := json.Unmarshal(fields[col-1], &release); err != nil {
+		return nil, err
+	}
+	name := strings.ToLower(release.ContainerName)
+	if name == "" {
+		// Nothing to resolve.
+		return p, nil
+	}
+	results := ms.Lookup(name, -1)
+	if len(results) != 1 {
+		// Too many or too few matches? We return the document unchanged.
+		return p, nil
+	}
+	resolved := m[results[0]]
+	if len(resolved) != 1 {
+		// Abbreviation mapping to different full names? Skip.
+		return p, nil
+	}
+	release.Extra.Skate.ResolvedContainerName = resolved[0]
+	b, err := json.Marshal(&release)
+	if err != nil {
+		return nil, err
+	}
+	fields[col-1] = b
+	return append(bytes.Join(fields, sep), eol...), nil
+}
+
+// NameAbbrev is one document of the abbreviation file: a full journal name
+// and, optionally, its abbreviation.
+type NameAbbrev struct {
+	Name   string `json:"name"`
+	Abbrev string `json:"abbrev"`
+}
+
+// createAbbreviationMatchSet creates a match set from the JSONL abbreviations
+// file at filename. It also returns a mapping from each lowercased
+// abbreviation to the lowercased full journal names collected for it.
+// Blank lines and documents without an abbreviation are skipped.
+func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]string, error) {
+	f, err := os.Open(filename)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer f.Close()
+	var (
+		sm = make(map[string]set.Set)
+		br = bufio.NewReader(f)
+	)
+	for {
+		// ReadBytes returns the final line together with io.EOF when the
+		// file does not end in a newline, so the data is handled before
+		// the EOF check.
+		b, rerr := br.ReadBytes('\n')
+		if rerr != nil && rerr != io.EOF {
+			return nil, nil, fmt.Errorf("reading %s: %w", filename, rerr)
+		}
+		if b = bytes.TrimSpace(b); len(b) > 0 {
+			// A fresh value per document: "abbrev" is optional and a
+			// reused struct would keep the previous line's abbreviation.
+			var na NameAbbrev
+			if err := json.Unmarshal(b, &na); err != nil {
+				return nil, nil, fmt.Errorf("parsing %s: %w", filename, err)
+			}
+			if key := strings.ToLower(na.Abbrev); key != "" {
+				if _, ok := sm[key]; !ok {
+					sm[key] = set.New()
+				}
+				sm[key].Add(strings.ToLower(na.Name))
+			}
+		}
+		if rerr == io.EOF {
+			break
+		}
+	}
+	var (
+		m             = make(map[string][]string, len(sm))
+		abbreviations = make([]string, 0, len(sm))
+	)
+	for k, v := range sm {
+		abbreviations = append(abbreviations, k)
+		m[k] = v.Slice()
+	}
+	return skate.NewMatchSet(abbreviations), m, nil
+}
diff --git a/skate/schema.go b/skate/schema.go
index 4845720..c2107e4 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -232,6 +232,7 @@ type Release struct {
 			ResearchGate struct {
 				URL string `json:"url,omitempty"`
 			} `json:"rg,omitempty"`
+			ResolvedContainerName string `json:"resolved_container_name"`
 		} `json:"skate,omitempty"`
 		OpenLibrary struct {
 			HasFulltext bool `json:"has_fulltext,omitempty"`
diff --git a/skate/schema_test.go b/skate/schema_test.go
index 9fe808b..57c4700 100644
--- a/skate/schema_test.go
+++ b/skate/schema_test.go
@@ -107,6 +107,7 @@ func TestOpenLibraryToRelease(t *testing.T) {
 						ResearchGate struct {
 							URL string `json:"url,omitempty"`
 						} `json:"rg,omitempty"`
+						ResolvedContainerName string `json:"resolved_container_name"`
 					} `json:"skate,omitempty"`
 					OpenLibrary struct {
 						HasFulltext bool `json:"has_fulltext,omitempty"`