skate/cmd/skate-cdx-lookup/main.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

// skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try
// to read from HDFS in parallel and cache some mapping information locally
// for fast access.
//
// What we want: Look up 10-100M URLs and report, for each one, whether we
// have it or not. Also make this a bit more generic, so we can look up all
// kinds of things in the CDX index.
//
// Alternatives: Spark, Sparkling, PIG, Hive, ...
//
// We take advantage of index files and sorted data. The complete dataset is
// 66TB, gzip compressed.
//
// An example line:
//
// org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz
//
// The index files are named part-a-00276-idx and are typically around 100M, not compressed. 900K lines, takes 1-2s to scan.
//
package main

import (
	"flag"
	"fmt"
	"log"
	"sort"

	"github.com/colinmarc/hdfs"
)

// Command-line flags; values are only meaningful after flag.Parse has run.
var (
	// cdxDir is the HDFS directory that main lists to pick a CDX index entry.
	cdxDir = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir")
	// nameNode is the address handed to the HDFS client constructor. Per its
	// help text it may stay empty when the Hadoop environment is configured.
	nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
)

func main() {
	flag.Usage = func() {
		fmt.Println(`
Make sure HADOOP env is set up.

$ git clone https://git.archive.org/webgroup/hadoop-env.git
$ source hadoop-env/prod/setup-env.sh
$ echo $HADOOP_CONF_DIR # should not be empty

`)
	}
	flag.Parse()
	client, err := hdfs.New(*nameNode)
	if err != nil {
		log.Fatal(err)
	}
	fis, err := client.ReadDir(*cdxDir)
	if err != nil {
		log.Fatal(err)
	}
	var names []string
	for _, fi := range fis {
		names = append(names, fi.Name())
	}
	sort.Strings(names)
	if len(names) == 0 {
		log.Fatalf("missing files: %s", *cdxDir)
	}
	cdxTs := names[0]
	log.Printf("using %s", cdxTs)
}