package skate

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"strings"

	"github.com/sethgrid/pester"
)

// cdxApi is the endpoint of the Wayback Machine CDX search API.
const cdxApi = "http://web.archive.org/cdx/search/cdx"

// CDX is a single line of a CDX API response; more fields might be added
// later. All values are kept as the raw strings found in the response.
type CDX struct {
	Surt        string // first column, e.g. "net,ijmse)/uploadfile/..."
	Date        string // second column, e.g. "20170516210333"
	Link        string // third column, the captured URL
	ContentType string // fourth column, e.g. "application/pdf"
	StatusCode  string // fifth column, e.g. "200"
	Checksum    string // sixth column, e.g. "PBPHE2OILTB43TAOUO33GBWLE2SS4LQX"
	Size        string // seventh column, e.g. "2079755"
}

// LookupCDX asks the CDX API about link. A response line looks like this:
//
//	net,ijmse)/uploadfile/2016/1214/20161214052559646.pdf 20170516210333
//	http://www.ijmse.net:80/uploadfile/2016/1214/20161214052559646.pdf
//	application/pdf 200 PBPHE2OILTB43TAOUO33GBWLE2SS4LQX 2079755
//
// (a single line, wrapped here for readability).
//
// The link gets an "http://" scheme prepended if it has none and is then
// query-escaped, so characters such as "&" or "#" in the link cannot alter
// the API query. Besides the parsed records, the raw response body is
// returned as well. A response with an HTTP status of 400 or above yields an
// error; the body read so far is still returned in that case.
func LookupCDX(link string) (result []CDX, b []byte, err error) {
	link = prependSchema(link)
	cdxlink := fmt.Sprintf("%s?url=%s", cdxApi, url.QueryEscape(link))
	log.Printf("[lookup] %s", cdxlink)
	req, err := http.NewRequest("GET", cdxlink, nil)
	if err != nil {
		return nil, b, err
	}
	resp, err := pester.Do(req)
	if err != nil {
		return nil, b, err
	}
	defer resp.Body.Close()
	b, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, b, err
	}
	// Do not try to parse an error page as CDX records.
	if resp.StatusCode >= 400 {
		return nil, b, fmt.Errorf("cdx lookup %s: unexpected status %s", cdxlink, resp.Status)
	}
	result, err = ParseCDX(b)
	return result, b, err
}

// ParseCDX parses a raw CDX API response body into records, one per line.
// Blank lines are skipped silently; lines with fewer than seven
// whitespace-separated fields are logged and skipped. Fields beyond the
// seventh are ignored. The returned error is currently always nil.
func ParseCDX(b []byte) (result []CDX, err error) {
	for _, line := range strings.Split(string(b), "\n") {
		fields := strings.Fields(line)
		if len(fields) == 0 {
			continue
		}
		if len(fields) < 7 {
			log.Printf("short line: %s", line)
			continue
		}
		result = append(result, CDX{
			Surt:        fields[0],
			Date:        fields[1],
			Link:        fields[2],
			ContentType: fields[3],
			StatusCode:  fields[4],
			Checksum:    fields[5],
			Size:        fields[6],
		})
	}
	return result, nil
}

// prependSchema returns s unchanged if it already starts with "http://" or
// "https://", and s prefixed with "http://" otherwise. Checking the full
// scheme (rather than just "http") keeps hosts like "httpbin.org" from being
// mistaken for absolute URLs.
func prependSchema(s string) string {
	if strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://") {
		return s
	}
	return "http://" + s
}