package skate

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"sort"
	"strings"
	"time"

	"github.com/sethgrid/pester"
)

const cdxApi = "http://web.archive.org/cdx/search/cdx"

// CDXLine holds the whitespace separated columns of a single line of a CDX
// API response, in the order they appear; might add more fields later. All
// values are kept as the raw strings found in the response.
type CDXLine struct {
	Surt        string
	Date        string
	Link        string
	ContentType string
	StatusCode  string
	Checksum    string
	Size        string
}

// CDX contains a number of CDX records.
type CDX []CDXLine

// ByDate sorts CDX records by date. Dates are compared as strings.
type ByDate CDX

// Len returns the number of records.
func (b ByDate) Len() int { return len(b) }

// Swap exchanges the records at positions i and j.
func (b ByDate) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

// Less reports whether the date of record i sorts before that of record j.
func (b ByDate) Less(i, j int) bool { return b[i].Date < b[j].Date }

// CDXSummary condenses a list of CDX records: Last is the date of the most
// recent record, LastOK the date of the most recent record with status code
// "200" and DeltaDays the number of whole days between the two. A date that
// could not be determined is reported as "NA".
type CDXSummary struct {
	Last      string `json:"last"`
	LastOK    string `json:"ok"`
	DeltaDays int    `json:"delta"`
}

// String renders the summary as a single line of key=value pairs.
func (s *CDXSummary) String() string {
	return fmt.Sprintf("last=%s ok=%s delta=%v", s.Last, s.LastOK, s.DeltaDays)
}

// Summary returns a short, task-specific summary of the records. Nothing is
// printed and the order of the records in c is left untouched. For an empty
// list both dates are "NA"; if no record has status code "200", LastOK is "NA"
// and DeltaDays is zero.
func (c CDX) Summary() *CDXSummary {
	summary := &CDXSummary{Last: "NA", LastOK: "NA"}
	if len(c) == 0 {
		return summary
	}
	// Sort a copy, newest first, so the caller's slice keeps its order.
	sorted := make(CDX, len(c))
	copy(sorted, c)
	sort.Sort(sort.Reverse(ByDate(sorted)))

	summary.Last = sorted[0].Date
	for _, line := range sorted {
		if line.StatusCode == "200" {
			summary.LastOK = line.Date
			break
		}
	}
	// The delta stays zero if either timestamp cannot be parsed, e.g. "NA".
	if d, err := tsDiff(summary.Last, summary.LastOK); err == nil {
		summary.DeltaDays = int(d.Hours() / 24)
	}
	return summary
}

// tsDiff returns the duration between two strings timestamps, like:
// "20140304124333". The result is never negative, regardless of the order of
// the arguments. An error is returned if either timestamp does not parse.
func tsDiff(a, b string) (time.Duration, error) {
	const layout = "20060102150405"
	ta, err := time.Parse(layout, a)
	if err != nil {
		return 0, err
	}
	tb, err := time.Parse(layout, b)
	if err != nil {
		return 0, err
	}
	if ta.Before(tb) {
		return tb.Sub(ta), nil
	}
	return ta.Sub(tb), nil
}

// LookupCDX asks CDX API.
Result will be like: // net,ijmse)/uploadfile/2016/1214/20161214052559646.pdf 20170516210333 // http://www.ijmse.net:80/uploadfile/2016/1214/20161214052559646.pdf // application/pdf 200 PBPHE2OILTB43TAOUO33GBWLE2SS4LQX 2079755 // // Returns a parsed value, but also the raw response body. func LookupCDX(link string) (result CDX, b []byte, err error) { link = prependSchema(link) cdxlink := fmt.Sprintf("%s?url=%s", cdxApi, link) log.Printf("[lookup] %s", cdxlink) req, err := http.NewRequest("GET", cdxlink, nil) if err != nil { return nil, b, err } resp, err := pester.Do(req) if err != nil { return nil, b, err } defer resp.Body.Close() if resp.StatusCode >= 400 { return nil, nil, fmt.Errorf("api returned HTTP %v", resp.StatusCode) } b, err = ioutil.ReadAll(resp.Body) if err != nil { return nil, b, err } result, err = ParseCDX(b) return result, b, err } // ParseCDX parses a CDX line (as coming from the API). func ParseCDX(b []byte) (result CDX, err error) { for _, line := range strings.Split(string(b), "\n") { var fields = strings.Fields(line) if len(fields) == 0 { continue } if len(fields) < 7 { log.Printf("short line: %s", line) continue } cdx := CDXLine{ Surt: fields[0], Date: fields[1], Link: fields[2], ContentType: fields[3], StatusCode: fields[4], Checksum: fields[5], Size: fields[6], } result = append(result, cdx) } return result, nil } func prependSchema(s string) string { if strings.HasPrefix(s, "http") { return s } return fmt.Sprintf("http://%s", s) }