diff options
-rw-r--r-- | skate/cdx.go | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/skate/cdx.go b/skate/cdx.go new file mode 100644 index 0000000..c618c97 --- /dev/null +++ b/skate/cdx.go @@ -0,0 +1,74 @@ +package skate + +import ( + "fmt" + "io/ioutil" + "log" + "net/http" + "strings" + + "github.com/sethgrid/pester" +) + +const cdxApi = "http://web.archive.org/cdx/search/cdx" + +// CDX line, might add more fields later. +type CDX struct { + Surt string + Date string + Link string + ContentType string + StatusCode string + Checksum string + Size string +} + +// LookupCDX asks CDX API. Result will be like: +// net,ijmse)/uploadfile/2016/1214/20161214052559646.pdf 20170516210333 +// http://www.ijmse.net:80/uploadfile/2016/1214/20161214052559646.pdf +// application/pdf 200 PBPHE2OILTB43TAOUO33GBWLE2SS4LQX 2079755 +func LookupCDX(link string) (result []CDX, err error) { + link = prependSchema(link) + cdxlink := fmt.Sprintf("%s?url=%s", cdxApi, link) + log.Printf("[lookup] %s", cdxlink) + req, err := http.NewRequest("GET", cdxlink, nil) + if err != nil { + return nil, err + } + resp, err := pester.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + b, err := ioutil.ReadAll(resp.Body) + if err != nil { + return nil, err + } + for _, line := range strings.Split(string(b), "\n") { + var fields = strings.Fields(line) + if len(fields) == 0 { + continue + } + if len(fields) < 7 { + log.Printf("short line: %s", line) + } + cdx := CDX{ + Surt: fields[0], + Date: fields[1], + Link: fields[2], + ContentType: fields[3], + StatusCode: fields[4], + Checksum: fields[5], + Size: fields[6], + } + result = append(result, cdx) + } + return result, nil +} + +func prependSchema(s string) string { + if strings.HasPrefix(s, "http") { + return s + } + return fmt.Sprintf("http://%s", s) +} |