aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-03-31 23:35:34 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-03-31 23:35:34 +0200
commitcedef349990c12747efdbe10f692c57436dc5c91 (patch)
treeb71d6b8a2f908e1ef5c0f15779f79fddbcd3a482 /skate
parentac416f9a0f15597a9a0391e3112cefcecb814d05 (diff)
downloadrefcat-cedef349990c12747efdbe10f692c57436dc5c91.tar.gz
refcat-cedef349990c12747efdbe10f692c57436dc5c91.zip
remove skate-biblioref
Diffstat (limited to 'skate')
-rw-r--r--skate/Makefile2
-rw-r--r--skate/README.md17
-rw-r--r--skate/cmd/skate-biblioref/main.go139
3 files changed, 17 insertions, 141 deletions
diff --git a/skate/Makefile b/skate/Makefile
index 3574f4d..ccfbe9e 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-biblioref skate-cluster-stats skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-biblioref-from-wikipedia
+TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-cluster-stats skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-biblioref-from-wikipedia
PKGNAME := skate
.PHONY: test
diff --git a/skate/README.md b/skate/README.md
index 2892190..6c39717 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -1,6 +1,21 @@
# skate
-Key extractors and zipping tools.
+The skate suite of command line tools has been written for various parts of the
+citation graph pipeline.
+
+## Tools
+
+### skate-biblioref
+### skate-biblioref-from-wikipedia
+### skate-bref-id
+### skate-cluster
+### skate-cluster-stats
+### skate-derive-key
+### skate-from-unstructured
+### skate-ref-to-release
+### skate-to-doi
+### skate-verify
+
Goal: make key extraction and comparisons fast for billions of records on a
single machine to support deduplication work for [fatcat](https://fatcat.wiki)
diff --git a/skate/cmd/skate-biblioref/main.go b/skate/cmd/skate-biblioref/main.go
deleted file mode 100644
index d16c99b..0000000
--- a/skate/cmd/skate-biblioref/main.go
+++ /dev/null
@@ -1,139 +0,0 @@
-// Experimental: Turn the minimal cluster result (key, target, source) into an
-// indexable biblio ref (10eb30251f89806cb7a0f147f427c5ea7e5f9941).
-//
-// Supports multiple input styles transparently, for the moment.
-//
-// "id style"
-//
-// ---- id, title, ... --- ---- target -------------- ---- source --------------
-//
-// 10.1001/2012.jama.11164 zhscs2mjlvcdte2i3j44ibgzae icg7bkoeqvfqnc5t5ot4evto6a
-// 10.1001/2012.jama.11164 zhscs2mjlvcdte2i3j44ibgzae ichuaiowbvbx5ajae5ing27lka
-// 10.1001/2012.jama.11164 zhscs2mjlvcdte2i3j44ibgzae io6b76ow6ngxnilc24qsf5kw6i
-//
-// "verify style"
-//
-// ---- target ------------------------------------------ ---- source ------------------------------------------ -- match ---- ---- match reason ------------------
-//
-// https://fatcat.wiki/release/a6xucdggk5h7bcmbxidvqt7hxe https://fatcat.wiki/release/amnpvj5ma5dxlc2a3x2bm2zbnq Status.STRONG Reason.SLUG_TITLE_AUTHOR_MATCH
-// https://fatcat.wiki/release/vyppsuuh2bhapdwcqzln5momta https://fatcat.wiki/release/6gd53yl5yzakrlr72xeojamchi Status.DIFFERENT Reason.CONTRIB_INTERSECTION_EMPTY
-// https://fatcat.wiki/release/hazousx6wna5bn5e27s5mrljzq https://fatcat.wiki/release/iajt2xam5nbc3ichkxxuhqaqw4 Status.DIFFERENT Reason.YEAR
-//
-// Input might change, so we keep this short.
-package main
-
-import (
- "flag"
- "fmt"
- "io/ioutil"
- "log"
- "os"
- "runtime"
- "strconv"
- "strings"
- "time"
-
- "git.archive.org/martin/cgraph/skate"
- "git.archive.org/martin/cgraph/skate/parallel"
- "github.com/dgraph-io/ristretto"
- jsoniter "github.com/json-iterator/go"
- "github.com/sethgrid/pester"
-)
-
-var (
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 100000, "batch size")
- extended = flag.Bool("E", false, "fetch data from fatcat API")
-
- json = jsoniter.ConfigCompatibleWithStandardLibrary
- bytesNewline = []byte("\n")
- cache *ristretto.Cache
- err error
-)
-
-func main() {
- flag.Parse()
- cache, err = ristretto.NewCache(&ristretto.Config{
- NumCounters: 1e7, // number of keys to track frequency of (10M).
- MaxCost: 1 << 30, // maximum cost of cache (1GB).
- BufferItems: 64, // number of keys per Get buffer.
- })
- if err != nil {
- log.Fatal(err)
- }
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
- var (
- fields = strings.Fields(string(p))
- target, source, matchStatus, matchReason, matchProvenance string
- )
- switch len(fields) {
- case 3:
- // Some join output.
- source = fields[2]
- target = fields[1]
- matchProvenance = "join"
- case 4:
- source = strings.ReplaceAll(fields[1], "https://fatcat.wiki/release/", "")
- target = strings.ReplaceAll(fields[0], "https://fatcat.wiki/release/", "")
- matchProvenance = "fuzzycat/ebee2de"
- matchStatus = strings.ReplaceAll(fields[2], "Status.", "")
- matchReason = strings.ReplaceAll(fields[3], "Reason.", "")
- }
- if source == target {
- return nil, nil
- }
- br := skate.BiblioRef{
- IndexedTs: time.Now().Unix(),
- SourceReleaseIdent: source,
- TargetReleaseIdent: target,
- MatchStatus: matchStatus,
- MatchReason: matchReason,
- MatchProvenance: matchProvenance,
- }
- if *extended {
- var release skate.Release
- if err := FetchRelease(source, &release); err != nil {
- log.Fatal(err)
- }
- br.SourceReleaseStage = release.ReleaseStage
- br.SourceWorkIdent = release.WorkID
- br.SourceYear = strconv.Itoa(release.ReleaseYear())
- if err := FetchRelease(target, &release); err != nil {
- log.Fatal(err)
- }
- br.TargetWorkIdent = release.WorkID
- }
- b, err := json.Marshal(br)
- b = append(b, bytesNewline...)
- return b, err
- })
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
-}
-
-func FetchRelease(ident string, release *skate.Release) error {
- v, found := cache.Get(ident)
- if !found {
- link := fmt.Sprintf("https://api.fatcat.wiki/v0/release/%s", ident)
- resp, err := pester.Get(link)
- if err != nil {
- return err
- }
- defer resp.Body.Close()
- b, err := ioutil.ReadAll(resp.Body)
- if err != nil {
- return err
- }
- cache.Set(ident, string(b), 1)
- return json.Unmarshal(b, release)
- } else {
- s, ok := v.(string)
- if !ok {
- return fmt.Errorf("invalid cache value")
- }
- return json.Unmarshal([]byte(s), release)
- }
-}