From 8714537ecffa0641516bc01b8cdc4cdd9a9d975c Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 20 May 2021 14:31:04 +0200 Subject: wip: cdx lookup --- skate/cdx.go | 73 +++++++++++++++++++++++++++++++++++--- skate/cmd/skate-cdx-lookup/main.go | 10 ++++-- skate/go.mod | 4 +-- skate/go.sum | 7 ++++ 4 files changed, 84 insertions(+), 10 deletions(-) diff --git a/skate/cdx.go b/skate/cdx.go index f98b781..ed394ed 100644 --- a/skate/cdx.go +++ b/skate/cdx.go @@ -5,15 +5,17 @@ import ( "io/ioutil" "log" "net/http" + "sort" "strings" + "time" "github.com/sethgrid/pester" ) const cdxApi = "http://web.archive.org/cdx/search/cdx" -// CDX line, might add more fields later. -type CDX struct { +// CDXLine line, might add more fields later. +type CDXLine struct { Surt string Date string Link string @@ -23,13 +25,74 @@ type CDX struct { Size string } +type CDX []CDXLine + +type ByDate CDX + +func (b ByDate) Len() int { + return len(b) +} + +func (b ByDate) Swap(i, j int) { + b[i], b[j] = b[j], b[i] +} + +func (b ByDate) Less(i, j int) bool { + return b[i].Date < b[j].Date +} + +func (c CDX) Summary() string { + var ( + dateLast = "NA" + dateLastOK = "NA" + delta = "NA" + ) + if len(c) == 0 { + return fmt.Sprintf("last=%s ok=%s", dateLast, dateLastOK) + } + sort.Sort(sort.Reverse(ByDate(c))) + dateLast = c[0].Date + for _, cdx := range c { + if cdx.StatusCode == "200" { + dateLastOK = cdx.Date + break + } + } + d, err := tsDiff(dateLast, dateLastOK) + if err == nil { + if d.Hours()/24 > 365 { + delta = fmt.Sprintf("\033[31;1;4m%0.0f\033[0m", d.Hours()/24) + } else { + delta = fmt.Sprintf("%0.0f", d.Hours()/24) + } + } + return fmt.Sprintf("last=%s ok=%s delta=%v", dateLast, dateLastOK, delta) +} + +// tsDiff returns the duration between two timestamps, like: 20140304124333. +func tsDiff(a, b string) (time.Duration, error) { + ta, err := time.Parse("20060102150405", a) + if err != nil { + return 0, err + } + tb, err := time.Parse("20060102150405", b) + if err != nil { + return 0, err + } + if ta.Before(tb) { + return tb.Sub(ta), nil + } else { + return ta.Sub(tb), nil + } +} + // LookupCDX asks CDX API. Result will be like: // net,ijmse)/uploadfile/2016/1214/20161214052559646.pdf 20170516210333 // http://www.ijmse.net:80/uploadfile/2016/1214/20161214052559646.pdf // application/pdf 200 PBPHE2OILTB43TAOUO33GBWLE2SS4LQX 2079755 // // Also returns the raw response body. -func LookupCDX(link string) (result []CDX, b []byte, err error) { +func LookupCDX(link string) (result CDX, b []byte, err error) { link = prependSchema(link) cdxlink := fmt.Sprintf("%s?url=%s", cdxApi, link) log.Printf("[lookup] %s", cdxlink) @@ -50,7 +113,7 @@ func LookupCDX(link string) (result []CDX, b []byte, err error) { return result, b, err } -func ParseCDX(b []byte) (result []CDX, err error) { +func ParseCDX(b []byte) (result CDX, err error) { for _, line := range strings.Split(string(b), "\n") { var fields = strings.Fields(line) if len(fields) == 0 { @@ -60,7 +123,7 @@ func ParseCDX(b []byte) (result []CDX, err error) { log.Printf("short line: %s", line) continue } - cdx := CDX{ + cdx := CDXLine{ Surt: fields[0], Date: fields[1], Link: fields[2], diff --git a/skate/cmd/skate-cdx-lookup/main.go b/skate/cmd/skate-cdx-lookup/main.go index e26102f..b480078 100644 --- a/skate/cmd/skate-cdx-lookup/main.go +++ b/skate/cmd/skate-cdx-lookup/main.go @@ -36,8 +36,11 @@ func main() { if *quiet { log.SetOutput(ioutil.Discard) } - var cache = skate.Cache{Dir: *cacheDir} - br := bufio.NewReader(r) + var ( + cache = skate.Cache{Dir: *cacheDir} + br = bufio.NewReader(r) + i int + ) for { line, err := br.ReadString('\n') if err == io.EOF { @@ -66,6 +69,7 @@ func main() { if err != nil { log.Fatal(err) } - fmt.Printf("% 10d %s\n", len(rows), line) + fmt.Printf("[%05d] % 10d %s %s\n", i, len(rows), rows.Summary(), line) + i++ } } diff --git a/skate/go.mod b/skate/go.mod index 911296f..a3d7501 100644 --- a/skate/go.mod +++ b/skate/go.mod @@ -3,14 +3,14 @@ module git.archive.org/martin/cgraph/skate go 1.15 require ( - github.com/adrg/xdg v0.3.3 // indirect + github.com/adrg/xdg v0.3.3 github.com/elastic/go-elasticsearch v0.0.0 github.com/elastic/go-elasticsearch/v7 v7.12.0 github.com/klauspost/cpuid/v2 v2.0.6 // indirect github.com/matryer/is v1.4.0 github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e github.com/segmentio/encoding v0.2.17 - github.com/sethgrid/pester v1.1.0 // indirect + github.com/sethgrid/pester v1.1.0 github.com/tidwall/gjson v1.7.5 golang.org/x/text v0.3.6 mvdan.cc/xurls/v2 v2.2.0 diff --git a/skate/go.sum b/skate/go.sum index b6a108f..da37d40 100644 --- a/skate/go.sum +++ b/skate/go.sum @@ -1,5 +1,6 @@ github.com/adrg/xdg v0.3.3 h1:s/tV7MdqQnzB1nKY8aqHvAMD+uCiuEDzVB5HLRY849U= github.com/adrg/xdg v0.3.3/go.mod h1:61xAR2VZcggl2St4O9ohF5qCKe08+JDmE4VNzPFQvOQ= +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/elastic/go-elasticsearch v0.0.0 h1:Pd5fqOuBxKxv83b0+xOAJDAkziWYwFinWnBO0y+TZaA= github.com/elastic/go-elasticsearch v0.0.0/go.mod h1:TkBSJBuTyFdBnrNqoPc54FN0vKf5c04IdM4zuStJ7xg= @@ -8,13 +9,16 @@ github.com/elastic/go-elasticsearch/v7 v7.12.0/go.mod h1:OJ4wdbtDNk5g503kvlHLyEr github.com/klauspost/cpuid/v2 v2.0.5/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI= github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/matryer/is v1.4.0 h1:sosSmIWwkYITGrxZ25ULNDeKiMNzFSr4V/eqBQP0PeE= github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU= github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e h1:S+/ptYdZtpK/MDstwCyt+ZHdXEpz86RJZ5gyZU4txJY= github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e/go.mod h1:uFMI8w+ref4v2r9jz+c9i1IfIttS/OkmLfrk1jne5hs= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/segmentio/encoding v0.2.17 h1:cgfmPc44u1po1lz5bSgF00gLCROBjDNc7h+H7I20zpc= @@ -22,6 +26,7 @@ github.com/segmentio/encoding v0.2.17/go.mod h1:7E68jTSWMnNoYhHi1JbLd7NBSB6XfE4v github.com/sethgrid/pester v1.1.0 h1:IyEAVvwSUPjs2ACFZkBe5N59BBUpSIkQ71Hr6cM5A+w= github.com/sethgrid/pester v1.1.0/go.mod h1:Ad7IjTpvzZO8Fl0vh9AzQ+j/jYZfyp2diGwI8m5q+ns= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/tidwall/gjson v1.7.5 h1:zmAN/xmX7OtpAkv4Ovfso60r/BiCi5IErCDYGNJu+uc= github.com/tidwall/gjson v1.7.5/go.mod h1:5/xDoumyyDNerp2U36lyolv46b3uF/9Bu6OfyQ9GImk= @@ -34,8 +39,10 @@ golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= mvdan.cc/xurls/v2 v2.2.0 h1:NSZPykBXJFCetGZykLAxaL6SIpvbVy/UFEniIfHAa8A= mvdan.cc/xurls/v2 v2.2.0/go.mod h1:EV1RMtya9D6G5DMYPGD8zTQzaHet6Jh8gFlRgGRJeO8= -- cgit v1.2.3