// skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try
// to read from HDFS in parallel (see the scanIndexFiles sketch below) and
// cache some mapping information locally for fast access.
//
// What we want: look up 10-100M URLs and report whether we have each of them
// or not. Also keep this a bit more generic, so that we can look up all kinds
// of things in the CDX index.
//
// Alternatives: Spark, Sparkling, Pig, Hive, ...
//
// We take advantage of index files and sorted data (see the findIndexRange
// sketch below). The complete dataset is 66TB, gzip compressed.
//
// An example line (see parseCDXLine below for one reading of the fields):
//
// org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz
//
// The index files are named like part-a-00276-idx and are typically around
// 100M, uncompressed, with about 900K lines; a full scan takes 1-2s.
//
package main

import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"log"
	"path"
	"sort"
	"strconv"
	"strings"
	"sync"

	"github.com/colinmarc/hdfs"
)

var (
	nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
	cdxDir   = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir")
)
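
// cdxLine is a parsed CDX entry. This type and parseCDXLine are a minimal
// sketch, not part of the original tool: the field names are one reading of
// the example line in the header comment (SURT, timestamp, original URL,
// mimetype, HTTP status, digest, redirect, meta, size, offset, filename),
// not taken from a CDX spec.
type cdxLine struct {
	Surt     string
	Date     string
	URL      string
	Mimetype string
	Status   string
	Digest   string
	Redirect string
	Meta     string
	Size     string
	Offset   int64
	Filename string
}

// parseCDXLine splits a single CDX line on whitespace and maps the eleven
// fields onto a cdxLine.
func parseCDXLine(s string) (*cdxLine, error) {
	fields := strings.Fields(s)
	if len(fields) != 11 {
		return nil, fmt.Errorf("expected 11 fields, got %d: %s", len(fields), s)
	}
	offset, err := strconv.ParseInt(fields[9], 10, 64)
	if err != nil {
		return nil, fmt.Errorf("cannot parse offset: %v", err)
	}
	return &cdxLine{
		Surt:     fields[0],
		Date:     fields[1],
		URL:      fields[2],
		Mimetype: fields[3],
		Status:   fields[4],
		Digest:   fields[5],
		Redirect: fields[6],
		Meta:     fields[7],
		Size:     fields[8],
		Offset:   offset,
		Filename: fields[10],
	}, nil
}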

func main() {
	flag.Usage = func() {
		fmt.Println(`
Make sure HADOOP env is set up.
$ git clone https://git.archive.org/webgroup/hadoop-env.git
$ source hadoop-env/prod/setup-env.sh
$ echo $HADOOP_CONF_DIR # should not be empty
`)
	}
	flag.Parse()
	client, err := hdfs.New(*nameNode)
	if err != nil {
		log.Fatal(err)
	}
	// Each entry in the CDX directory is expected to name a snapshot; collect
	// and sort the names.
	fis, err := client.ReadDir(*cdxDir)
	if err != nil {
		log.Fatal(err)
	}
	var names []string
	for _, fi := range fis {
		names = append(names, fi.Name())
	}
	sort.Strings(names)
	if len(names) == 0 {
		log.Fatalf("missing files: %s", *cdxDir)
	}
	// Pick the first name in sorted order.
	cdxTs := names[0]
	log.Printf("using %s", cdxTs)
}
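
// findIndexRange sketches how the sorted index files can be exploited; it is
// hypothetical and assumes that every idx line starts with a SURT key
// followed by whitespace -- the actual idx layout is not documented in this
// file. Because the lines are sorted, the last line with a key <= the wanted
// key and the first line with a key > it bracket the region of the much
// larger data file that could contain the key. A sequential scan is fine
// here, since a whole index file takes only 1-2s to read.
func findIndexRange(r io.Reader, key string) (lower, upper string, err error) {
	sc := bufio.NewScanner(r)
	for sc.Scan() {
		line := sc.Text()
		k := line
		if i := strings.IndexAny(line, " \t"); i > -1 {
			k = line[:i]
		}
		if k <= key {
			lower = line
			continue
		}
		upper = line
		break
	}
	return lower, upper, sc.Err()
}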
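
// scanIndexFiles sketches the "read from HDFS in parallel" idea from the
// header comment; it is hypothetical as well, not the tool's actual design.
// A small worker pool opens the index files under dir concurrently and
// applies fn to each; a real implementation would want context support and
// richer error reporting.
func scanIndexFiles(client *hdfs.Client, dir string, workers int, fn func(name string, r io.Reader) error) error {
	fis, err := client.ReadDir(dir)
	if err != nil {
		return err
	}
	queue := make(chan string)
	errc := make(chan error, len(fis))
	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for name := range queue {
				f, err := client.Open(name)
				if err != nil {
					errc <- err
					continue
				}
				err = fn(name, f)
				f.Close()
				if err != nil {
					errc <- err
				}
			}
		}()
	}
	for _, fi := range fis {
		if strings.HasSuffix(fi.Name(), "-idx") {
			queue <- path.Join(dir, fi.Name())
		}
	}
	close(queue)
	wg.Wait()
	close(errc)
	for err := range errc {
		if err != nil {
			return err
		}
	}
	return nil
}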