diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-03-21 01:17:38 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-03-21 01:17:38 +0100 |
commit | 09a7e8c9d013f13a1aa1ef4e9b7f397647b79967 (patch) | |
tree | 122b474e27afbc66cba1182e983ef5c8555ed12f /skate/cmd/skate-biblioref | |
parent | a7e0cf191ebf8fb499e0ab9a3b6cae45727f1286 (diff) | |
download | refcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.tar.gz refcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.zip |
initial import of skate
Diffstat (limited to 'skate/cmd/skate-biblioref')
-rw-r--r-- | skate/cmd/skate-biblioref/main.go | 139 |
1 files changed, 139 insertions, 0 deletions
diff --git a/skate/cmd/skate-biblioref/main.go b/skate/cmd/skate-biblioref/main.go new file mode 100644 index 0000000..85b2a46 --- /dev/null +++ b/skate/cmd/skate-biblioref/main.go @@ -0,0 +1,139 @@ +// Experimental: Turn the minimal cluster result (key, target, source) into an +// indexable biblio ref (10eb30251f89806cb7a0f147f427c5ea7e5f9941). +// +// Supports multiple input styles transparently, for the moment. +// +// "id style" +// +// ---- id, title, ... --- ---- target -------------- ---- source -------------- +// +// 10.1001/2012.jama.11164 zhscs2mjlvcdte2i3j44ibgzae icg7bkoeqvfqnc5t5ot4evto6a +// 10.1001/2012.jama.11164 zhscs2mjlvcdte2i3j44ibgzae ichuaiowbvbx5ajae5ing27lka +// 10.1001/2012.jama.11164 zhscs2mjlvcdte2i3j44ibgzae io6b76ow6ngxnilc24qsf5kw6i +// +// "verify style" +// +// ---- target ------------------------------------------ ---- source ------------------------------------------ -- match ---- ---- match reason ------------------ +// +// https://fatcat.wiki/release/a6xucdggk5h7bcmbxidvqt7hxe https://fatcat.wiki/release/amnpvj5ma5dxlc2a3x2bm2zbnq Status.STRONG Reason.SLUG_TITLE_AUTHOR_MATCH +// https://fatcat.wiki/release/vyppsuuh2bhapdwcqzln5momta https://fatcat.wiki/release/6gd53yl5yzakrlr72xeojamchi Status.DIFFERENT Reason.CONTRIB_INTERSECTION_EMPTY +// https://fatcat.wiki/release/hazousx6wna5bn5e27s5mrljzq https://fatcat.wiki/release/iajt2xam5nbc3ichkxxuhqaqw4 Status.DIFFERENT Reason.YEAR +// +// Input might change, so we keep this short. +package main + +import ( + "flag" + "fmt" + "io/ioutil" + "log" + "os" + "runtime" + "strconv" + "strings" + "time" + + "github.com/dgraph-io/ristretto" + jsoniter "github.com/json-iterator/go" + "git.archive.org/martin/cgraph/skate" + "git.archive.org/martin/cgraph/skate/parallel" + "github.com/sethgrid/pester" +) + +var ( + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 100000, "batch size") + extended = flag.Bool("E", false, "fetch data from fatcat API") + + json = jsoniter.ConfigCompatibleWithStandardLibrary + bytesNewline = []byte("\n") + cache *ristretto.Cache + err error +) + +func main() { + flag.Parse() + cache, err = ristretto.NewCache(&ristretto.Config{ + NumCounters: 1e7, // number of keys to track frequency of (10M). + MaxCost: 1 << 30, // maximum cost of cache (1GB). + BufferItems: 64, // number of keys per Get buffer. + }) + if err != nil { + log.Fatal(err) + } + pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { + var ( + fields = strings.Fields(string(p)) + target, source, matchStatus, matchReason, matchProvenance string + ) + switch len(fields) { + case 3: + // Some join output. + source = fields[2] + target = fields[1] + matchProvenance = "join" + case 4: + source = strings.ReplaceAll(fields[1], "https://fatcat.wiki/release/", "") + target = strings.ReplaceAll(fields[0], "https://fatcat.wiki/release/", "") + matchProvenance = "fuzzycat/ebee2de" + matchStatus = strings.ReplaceAll(fields[2], "Status.", "") + matchReason = strings.ReplaceAll(fields[3], "Reason.", "") + } + if source == target { + return nil, nil + } + br := skate.BiblioRef{ + UpdateTs: time.Now().Unix(), + SourceReleaseIdent: source, + TargetReleaseIdent: target, + MatchStatus: matchStatus, + MatchReason: matchReason, + MatchProvenance: matchProvenance, + } + if *extended { + var release skate.Release + if err := FetchRelease(source, &release); err != nil { + log.Fatal(err) + } + br.SourceReleaseStage = release.ReleaseStage + br.SourceWorkIdent = release.WorkID + br.SourceYear = strconv.Itoa(release.ReleaseYear()) + if err := FetchRelease(target, &release); err != nil { + log.Fatal(err) + } + br.TargetWorkIdent = release.WorkID + } + b, err := json.Marshal(br) + b = append(b, bytesNewline...) + return b, err + }) + pp.NumWorkers = *numWorkers + pp.BatchSize = *batchSize + if err := pp.Run(); err != nil { + log.Fatal(err) + } +} + +func FetchRelease(ident string, release *skate.Release) error { + v, found := cache.Get(ident) + if !found { + link := fmt.Sprintf("https://api.fatcat.wiki/v0/release/%s", ident) + resp, err := pester.Get(link) + if err != nil { + return err + } + defer resp.Body.Close() + b, err := ioutil.ReadAll(resp.Body) + if err != nil { + return err + } + cache.Set(ident, string(b), 1) + return json.Unmarshal(b, release) + } else { + s, ok := v.(string) + if !ok { + return fmt.Errorf("invalid cache value") + } + return json.Unmarshal([]byte(s), release) + } +} |