| author | Martin Czygan <martin.czygan@gmail.com> | 2021-04-09 01:58:47 +0200 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-04-19 20:29:17 +0200 | 
| commit | c0e097776dd191ee2f5321239b70a0fb7e9b08ef (patch) | |
| tree | 77fe0a679c8adad78fc40f3d5ff7ace630a66355 /skate/cmd | |
| parent | b0219be1225cfa685d4988be4da4a696ee6188a0 (diff) | |
| download | refcat-c0e097776dd191ee2f5321239b70a0fb7e9b08ef.tar.gz refcat-c0e097776dd191ee2f5321239b70a0fb7e9b08ef.zip | |
cleanup cdx lookup scripts
Diffstat (limited to 'skate/cmd')
| -rw-r--r-- | skate/cmd/skate-cdx-lookup/main.go | 170 | 
1 files changed, 0 insertions, 170 deletions
diff --git a/skate/cmd/skate-cdx-lookup/main.go b/skate/cmd/skate-cdx-lookup/main.go
deleted file mode 100644
index 00f27b5..0000000
--- a/skate/cmd/skate-cdx-lookup/main.go
+++ /dev/null
@@ -1,170 +0,0 @@
-// skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try
-// to read from HDFS in parallel and cache some mapping information locally for
-// fast access.
-//
-// What we want: Lookup 10-100M URLs quickly and report, whether the URL is in
-// GWB or not.  Also make this a bit more generic, so we can lookup other
-// things in the CDX index.
-//
-// As of 04/2021 the CDX is split into 300 files, each around 230G, for a total
-// of 70T (compressed, maybe 350T plain). Each file comes with a 90M index
-// containing about 1M lines (with surt, offset, ...).
-//
-// Test run and tiny design:
-//
-// * [ ] accept sorted input only
-// * [ ] get first URL, find the corresponding index file
-//
-// Raw index; only HTTP 200, or redirect; include everything; random URL from a
-// source; popular URL; hundreds of captures; filter the dump! SURT; huge
-// efficiency; PIG;
-// https://git.archive.org/webgroup/sandcrawler/-/tree/master/pig
-//
-// Alternatives: Spark, Sparkling, Pig, Hive, Java MR, ...
-//
-// We take advantage of index files and sorted data. The complete dataset is
-// 66TB, gzip compressed. We do not need compute to be distrubuted, as a single
-// machine may be enough to process the data.
-//
-// An example line:
-//
-// org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz
-//
-// The index files are named part-a-00276-idx and are typically around 100M, not compressed. 900K lines, takes 1-2s to scan.
-//
-// The idx files are probably concatenated gzips, otherwise we could not seek into them.
-package main
-
-import (
-	"bufio"
-	"compress/gzip"
-	"flag"
-	"fmt"
-	"io"
-	"log"
-	"sort"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/colinmarc/hdfs"
-)
-
-var (
-	nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
-	cdxDir   = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir")
-
-	note = `
-Make sure HADOOP env is set up.
-
-$ git clone https://git.archive.org/webgroup/hadoop-env.git
-$ source hadoop-env/prod/setup-env.sh
-$ echo $HADOOP_CONF_DIR # should not be empty
-`
-)
-
-func main() {
-	flag.Usage = func() {
-		fmt.Println(note)
-	}
-	flag.Parse()
-	client, err := hdfs.New(*nameNode)
-	if err != nil {
-		log.Fatal(err)
-	}
-	fis, err := client.ReadDir(*cdxDir)
-	if err != nil {
-		log.Fatal(err)
-	}
-	var names []string
-	for _, fi := range fis {
-		names = append(names, fi.Name())
-	}
-	sort.Strings(names)
-	if len(names) == 0 {
-		log.Fatalf("missing files: %s", *cdxDir)
-	}
-	cdxTs := names[0]
-	log.Printf("using %s", cdxTs)
-	// Example seek and read.
-	// /user/wmdata2/cdx-all-index/20210211202455/part-a-00271-idx, 845068 lines, uncompressed
-	// /user/wmdata2/cdx-all-index/20210211202455/part-a-00271.gz, maybe: concatenated gzip
-	f, err := client.Open("/user/wmdata2/cdx-all-index/" + cdxTs + "/part-a-00271-idx")
-	if err != nil {
-		log.Fatal(err)
-	}
-	defer f.Close()
-	var i int
-	br := bufio.NewReader(f)
-	for {
-		i++
-		line, err := br.ReadString('\n')
-		if err == io.EOF {
-			break
-		}
-		if err != nil {
-			log.Fatal(err)
-		}
-		indexLine, err := parseIndexLine(line)
-		if err != nil {
-			log.Fatal(err)
-		}
-		if i%25000 == 0 {
-			log.Printf("%d cdx index lines read", i)
-		}
-		if i == 100000 {
-			started := time.Now()
-			// example extraction
-			g, err := client.Open("/user/wmdata2/cdx-all-index/" + cdxTs + "/part-a-00271.gz")
-			if err != nil {
-				log.Fatal(err)
-			}
-			defer g.Close()
-			_, err = g.Seek(indexLine.Offset, io.SeekStart)
-			if err != nil {
-				log.Fatal(err)
-			}
-			lr := io.LimitReader(g, indexLine.Length)
-			gzr, err := gzip.NewReader(lr)
-			if err != nil {
-				log.Fatal(err)
-			}
-			n, err := io.Copy(io.Discard, gzr)
-			if err != nil {
-				log.Fatal(err)
-			}
-			log.Printf("scanned %d bytes in %v (from slice 100000)", n, time.Since(started))
-		}
-	}
-}
-
-// IndexLine contains CDX index fields.
-type IndexLine struct {
-	Surt   string
-	Date   string
-	Name   string
-	Offset int64
-	Length int64
-}
-
-func parseIndexLine(s string) (*IndexLine, error) {
-	parts := strings.Fields(strings.TrimSpace(s))
-	if len(parts) != 5 {
-		return nil, fmt.Errorf("invalid line: %s", s)
-	}
-	offset, err := strconv.Atoi(parts[3])
-	if err != nil {
-		return nil, fmt.Errorf("cannot parse offset: %v", offset)
-	}
-	length, err := strconv.Atoi(parts[4])
-	if err != nil {
-		return nil, fmt.Errorf("cannot parse length: %v", offset)
-	}
-	return &IndexLine{
-		Surt:   parts[0],
-		Date:   parts[1],
-		Name:   parts[2],
-		Offset: int64(offset),
-		Length: int64(length),
-	}, nil
-}
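For reference, the core step of the removed tool was to take an (offset, length) pair from a part-a-XXXXX-idx line and decompress only that slice of the corresponding concatenated-gzip CDX shard. Below is a minimal local sketch of that step using only the standard library; the file name and the offset/length values are placeholders, and in the removed code they came from parseIndexLine with the shard opened over HDFS via github.com/colinmarc/hdfs.

```go
// Sketch: extract a single gzip member from a concatenated-gzip CDX shard,
// given its byte offset and length as listed in the matching -idx file.
// File name and numbers are placeholders, not taken from the original tool.
package main

import (
	"compress/gzip"
	"io"
	"log"
	"os"
)

func main() {
	// Hypothetical local copy of one CDX shard (read from HDFS in the removed tool).
	f, err := os.Open("part-a-00271.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	// Placeholder values; in practice these come from a parsed index line.
	var offset, length int64 = 123456789, 54321
	if _, err := f.Seek(offset, io.SeekStart); err != nil {
		log.Fatal(err)
	}
	// A gzip reader started at a member boundary decodes just that member,
	// which is what makes seeking into the shard possible at all.
	gzr, err := gzip.NewReader(io.LimitReader(f, length))
	if err != nil {
		log.Fatal(err)
	}
	n, err := io.Copy(io.Discard, gzr)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("decompressed %d bytes from one CDX block", n)
}
```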
