aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-biblioref
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-03-21 01:17:38 +0100
committerMartin Czygan <martin.czygan@gmail.com>2021-03-21 01:17:38 +0100
commit09a7e8c9d013f13a1aa1ef4e9b7f397647b79967 (patch)
tree122b474e27afbc66cba1182e983ef5c8555ed12f /skate/cmd/skate-biblioref
parenta7e0cf191ebf8fb499e0ab9a3b6cae45727f1286 (diff)
downloadrefcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.tar.gz
refcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.zip
initial import of skate
Diffstat (limited to 'skate/cmd/skate-biblioref')
-rw-r--r--skate/cmd/skate-biblioref/main.go139
1 files changed, 139 insertions, 0 deletions
diff --git a/skate/cmd/skate-biblioref/main.go b/skate/cmd/skate-biblioref/main.go
new file mode 100644
index 0000000..85b2a46
--- /dev/null
+++ b/skate/cmd/skate-biblioref/main.go
@@ -0,0 +1,139 @@
+// Experimental: Turn the minimal cluster result (key, target, source) into an
+// indexable biblio ref (10eb30251f89806cb7a0f147f427c5ea7e5f9941).
+//
+// Supports multiple input styles transparently, for the moment.
+//
+// "id style"
+//
+// ---- id, title, ... --- ---- target -------------- ---- source --------------
+//
+// 10.1001/2012.jama.11164 zhscs2mjlvcdte2i3j44ibgzae icg7bkoeqvfqnc5t5ot4evto6a
+// 10.1001/2012.jama.11164 zhscs2mjlvcdte2i3j44ibgzae ichuaiowbvbx5ajae5ing27lka
+// 10.1001/2012.jama.11164 zhscs2mjlvcdte2i3j44ibgzae io6b76ow6ngxnilc24qsf5kw6i
+//
+// "verify style"
+//
+// ---- target ------------------------------------------ ---- source ------------------------------------------ -- match ---- ---- match reason ------------------
+//
+// https://fatcat.wiki/release/a6xucdggk5h7bcmbxidvqt7hxe https://fatcat.wiki/release/amnpvj5ma5dxlc2a3x2bm2zbnq Status.STRONG Reason.SLUG_TITLE_AUTHOR_MATCH
+// https://fatcat.wiki/release/vyppsuuh2bhapdwcqzln5momta https://fatcat.wiki/release/6gd53yl5yzakrlr72xeojamchi Status.DIFFERENT Reason.CONTRIB_INTERSECTION_EMPTY
+// https://fatcat.wiki/release/hazousx6wna5bn5e27s5mrljzq https://fatcat.wiki/release/iajt2xam5nbc3ichkxxuhqaqw4 Status.DIFFERENT Reason.YEAR
+//
+// Input might change, so we keep this short.
+package main
+
+import (
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "log"
+ "os"
+ "runtime"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/dgraph-io/ristretto"
+ jsoniter "github.com/json-iterator/go"
+ "git.archive.org/martin/cgraph/skate"
+ "git.archive.org/martin/cgraph/skate/parallel"
+ "github.com/sethgrid/pester"
+)
+
+var (
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 100000, "batch size")
+ extended = flag.Bool("E", false, "fetch data from fatcat API")
+
+ json = jsoniter.ConfigCompatibleWithStandardLibrary
+ bytesNewline = []byte("\n")
+ cache *ristretto.Cache
+ err error
+)
+
+func main() {
+ flag.Parse()
+ cache, err = ristretto.NewCache(&ristretto.Config{
+ NumCounters: 1e7, // number of keys to track frequency of (10M).
+ MaxCost: 1 << 30, // maximum cost of cache (1GB).
+ BufferItems: 64, // number of keys per Get buffer.
+ })
+ if err != nil {
+ log.Fatal(err)
+ }
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
+ var (
+ fields = strings.Fields(string(p))
+ target, source, matchStatus, matchReason, matchProvenance string
+ )
+ switch len(fields) {
+ case 3:
+ // Some join output.
+ source = fields[2]
+ target = fields[1]
+ matchProvenance = "join"
+ case 4:
+ source = strings.ReplaceAll(fields[1], "https://fatcat.wiki/release/", "")
+ target = strings.ReplaceAll(fields[0], "https://fatcat.wiki/release/", "")
+ matchProvenance = "fuzzycat/ebee2de"
+ matchStatus = strings.ReplaceAll(fields[2], "Status.", "")
+ matchReason = strings.ReplaceAll(fields[3], "Reason.", "")
+ }
+ if source == target {
+ return nil, nil
+ }
+ br := skate.BiblioRef{
+ UpdateTs: time.Now().Unix(),
+ SourceReleaseIdent: source,
+ TargetReleaseIdent: target,
+ MatchStatus: matchStatus,
+ MatchReason: matchReason,
+ MatchProvenance: matchProvenance,
+ }
+ if *extended {
+ var release skate.Release
+ if err := FetchRelease(source, &release); err != nil {
+ log.Fatal(err)
+ }
+ br.SourceReleaseStage = release.ReleaseStage
+ br.SourceWorkIdent = release.WorkID
+ br.SourceYear = strconv.Itoa(release.ReleaseYear())
+ if err := FetchRelease(target, &release); err != nil {
+ log.Fatal(err)
+ }
+ br.TargetWorkIdent = release.WorkID
+ }
+ b, err := json.Marshal(br)
+ b = append(b, bytesNewline...)
+ return b, err
+ })
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+}
+
+func FetchRelease(ident string, release *skate.Release) error {
+ v, found := cache.Get(ident)
+ if !found {
+ link := fmt.Sprintf("https://api.fatcat.wiki/v0/release/%s", ident)
+ resp, err := pester.Get(link)
+ if err != nil {
+ return err
+ }
+ defer resp.Body.Close()
+ b, err := ioutil.ReadAll(resp.Body)
+ if err != nil {
+ return err
+ }
+ cache.Set(ident, string(b), 1)
+ return json.Unmarshal(b, release)
+ } else {
+ s, ok := v.(string)
+ if !ok {
+ return fmt.Errorf("invalid cache value")
+ }
+ return json.Unmarshal([]byte(s), release)
+ }
+}