diff options
Diffstat (limited to 'skate/cmd/skate-cdx-lookup')
-rw-r--r-- | skate/cmd/skate-cdx-lookup/main.go | 106 |
1 files changed, 98 insertions, 8 deletions
diff --git a/skate/cmd/skate-cdx-lookup/main.go b/skate/cmd/skate-cdx-lookup/main.go index 742ca7d..2e43b8a 100644 --- a/skate/cmd/skate-cdx-lookup/main.go +++ b/skate/cmd/skate-cdx-lookup/main.go @@ -1,15 +1,16 @@ // skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try -// to read from HDSFs in parallel and cache some mapping information locally +// to read from HDFS in parallel and cache some mapping information locally // for fast access. // // What we want: Lookup 10-100M URLs and report, whether we have it or not. // Also make this a bit more generic, so we can lookup all kinds of things in // the CDX index. // -// Alternatives: Spark, Sparkling, PIG, Hive, ... +// Alternatives: Spark, Sparkling, Pig, Hive, ... // // We take advantage of index files and sorted data. The complete dataset is -// 66TB, gzip compressed. +// 66TB, gzip compressed. We do not need compute to be distributed, as a single +// machine may be enough to process the data. // // An example line: // @@ -17,13 +18,20 @@ // // The index files are named part-a-00276-idx and are typically around 100M, not compressed. 900K lines, takes 1-2s to scan. // +// The idx files are probably concatenated gzips, otherwise we could not seek into them. package main import ( + "bufio" + "compress/gzip" "flag" "fmt" + "io" "log" + "sort" + "strconv" + "strings" + "time" "github.com/colinmarc/hdfs" ) @@ -31,18 +39,19 @@ import ( var ( nameNode = flag.String("nn", "", "namenode, leave empty when env is set up") cdxDir = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir") -) -func main() { - flag.Usage = func() { - fmt.Println(` + note = ` Make sure HADOOP env is set up. 
$ git clone https://git.archive.org/webgroup/hadoop-env.git $ source hadoop-env/prod/setup-env.sh $ echo $HADOOP_CONF_DIR # should not be empty +` +) -`) +func main() { + flag.Usage = func() { + fmt.Println(note) } flag.Parse() client, err := hdfs.New(*nameNode) @@ -63,4 +72,85 @@ $ echo $HADOOP_CONF_DIR # should not be empty } cdxTs := names[0] log.Printf("using %s", cdxTs) + // Example seek and read. + // /user/wmdata2/cdx-all-index/20210211202455/part-a-00271-idx, 845068 lines, uncompressed + // /user/wmdata2/cdx-all-index/20210211202455/part-a-00271.gz, maybe: concatenated gzip + f, err := client.Open("/user/wmdata2/cdx-all-index/20210211202455/part-a-00271-idx") + if err != nil { + log.Fatal(err) + } + defer f.Close() + var i int + br := bufio.NewReader(f) + for { + i++ + line, err := br.ReadString('\n') + if err == io.EOF { + break + } + if err != nil { + log.Fatal(err) + } + indexLine, err := parseIndexLine(line) + if err != nil { + log.Fatal(err) + } + if i%25000 == 0 { + log.Printf("%d cdx index lines read", i) + } + if i == 100000 { + started := time.Now() + // example extraction + g, err := client.Open("/user/wmdata2/cdx-all-index/20210211202455/part-a-00271.gz") + if err != nil { + log.Fatal(err) + } + defer g.Close() + _, err = g.Seek(indexLine.Offset, io.SeekStart) + if err != nil { + log.Fatal(err) + } + lr := io.LimitReader(g, indexLine.Length) + gzr, err := gzip.NewReader(lr) + if err != nil { + log.Fatal(err) + } + n, err := io.Copy(io.Discard, gzr) + if err != nil { + log.Fatal(err) + } + log.Printf("scanned %d bytes in %v (from slice 100000)", n, time.Since(started)) + } + } +} + +// IndexLine contains CDX index fields. 
+type IndexLine struct { + Surt string + Date string + Name string + Offset int64 + Length int64 +} + +func parseIndexLine(s string) (*IndexLine, error) { + parts := strings.Fields(strings.TrimSpace(s)) + if len(parts) != 5 { + return nil, fmt.Errorf("invalid line: %s", s) + } + offset, err := strconv.Atoi(parts[3]) + if err != nil { + return nil, fmt.Errorf("cannot parse offset: %v", offset) + } + length, err := strconv.Atoi(parts[4]) + if err != nil { + return nil, fmt.Errorf("cannot parse length: %v", offset) + } + return &IndexLine{ + Surt: parts[0], + Date: parts[1], + Name: parts[2], + Offset: int64(offset), + Length: int64(length), + }, nil } |