author | Martin Czygan <martin.czygan@gmail.com> | 2021-04-09 01:58:47 +0200
---|---|---
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-04-19 20:29:17 +0200
commit | c0e097776dd191ee2f5321239b70a0fb7e9b08ef (patch) |
tree | 77fe0a679c8adad78fc40f3d5ff7ace630a66355 | /skate/cmd
parent | b0219be1225cfa685d4988be4da4a696ee6188a0 (diff) |
download | refcat-c0e097776dd191ee2f5321239b70a0fb7e9b08ef.tar.gz refcat-c0e097776dd191ee2f5321239b70a0fb7e9b08ef.zip |
cleanup cdx lookup scripts
Diffstat (limited to 'skate/cmd')
-rw-r--r-- | skate/cmd/skate-cdx-lookup/main.go | 170 |
1 file changed, 0 insertions, 170 deletions
diff --git a/skate/cmd/skate-cdx-lookup/main.go b/skate/cmd/skate-cdx-lookup/main.go
deleted file mode 100644
index 00f27b5..0000000
--- a/skate/cmd/skate-cdx-lookup/main.go
+++ /dev/null
@@ -1,170 +0,0 @@
-// skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try
-// to read from HDFS in parallel and cache some mapping information locally for
-// fast access.
-//
-// What we want: Lookup 10-100M URLs quickly and report whether the URL is in
-// GWB or not. Also make this a bit more generic, so we can look up other
-// things in the CDX index.
-//
-// As of 04/2021 the CDX is split into 300 files, each around 230G, for a total
-// of 70T (compressed, maybe 350T plain). Each file comes with a 90M index
-// containing about 1M lines (with surt, offset, ...).
-//
-// Test run and tiny design:
-//
-// * [ ] accept sorted input only
-// * [ ] get first URL, find the corresponding index file
-//
-// Raw index; only HTTP 200, or redirect; include everything; random URL from a
-// source; popular URL; hundreds of captures; filter the dump! SURT; huge
-// efficiency; PIG;
-// https://git.archive.org/webgroup/sandcrawler/-/tree/master/pig
-//
-// Alternatives: Spark, Sparkling, Pig, Hive, Java MR, ...
-//
-// We take advantage of index files and sorted data. The complete dataset is
-// 66TB, gzip compressed. We do not need compute to be distributed, as a single
-// machine may be enough to process the data.
-//
-// An example line:
-//
-// org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz
-//
-// The index files are named part-a-00276-idx and are typically around 100M, not compressed. 900K lines, takes 1-2s to scan.
-//
-// The idx files are probably concatenated gzips, otherwise we could not seek into them.
-package main
-
-import (
-	"bufio"
-	"compress/gzip"
-	"flag"
-	"fmt"
-	"io"
-	"log"
-	"sort"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/colinmarc/hdfs"
-)
-
-var (
-	nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
-	cdxDir   = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir")
-
-	note = `
-Make sure HADOOP env is set up.
-
-$ git clone https://git.archive.org/webgroup/hadoop-env.git
-$ source hadoop-env/prod/setup-env.sh
-$ echo $HADOOP_CONF_DIR # should not be empty
-`
-)
-
-func main() {
-	flag.Usage = func() {
-		fmt.Println(note)
-	}
-	flag.Parse()
-	client, err := hdfs.New(*nameNode)
-	if err != nil {
-		log.Fatal(err)
-	}
-	fis, err := client.ReadDir(*cdxDir)
-	if err != nil {
-		log.Fatal(err)
-	}
-	var names []string
-	for _, fi := range fis {
-		names = append(names, fi.Name())
-	}
-	sort.Strings(names)
-	if len(names) == 0 {
-		log.Fatalf("missing files: %s", *cdxDir)
-	}
-	cdxTs := names[0]
-	log.Printf("using %s", cdxTs)
-	// Example seek and read.
-	// /user/wmdata2/cdx-all-index/20210211202455/part-a-00271-idx, 845068 lines, uncompressed
-	// /user/wmdata2/cdx-all-index/20210211202455/part-a-00271.gz, maybe: concatenated gzip
-	f, err := client.Open("/user/wmdata2/cdx-all-index/" + cdxTs + "/part-a-00271-idx")
-	if err != nil {
-		log.Fatal(err)
-	}
-	defer f.Close()
-	var i int
-	br := bufio.NewReader(f)
-	for {
-		i++
-		line, err := br.ReadString('\n')
-		if err == io.EOF {
-			break
-		}
-		if err != nil {
-			log.Fatal(err)
-		}
-		indexLine, err := parseIndexLine(line)
-		if err != nil {
-			log.Fatal(err)
-		}
-		if i%25000 == 0 {
-			log.Printf("%d cdx index lines read", i)
-		}
-		if i == 100000 {
-			started := time.Now()
-			// example extraction
-			g, err := client.Open("/user/wmdata2/cdx-all-index/" + cdxTs + "/part-a-00271.gz")
-			if err != nil {
-				log.Fatal(err)
-			}
-			defer g.Close()
-			_, err = g.Seek(indexLine.Offset, io.SeekStart)
-			if err != nil {
-				log.Fatal(err)
-			}
-			lr := io.LimitReader(g, indexLine.Length)
-			gzr, err := gzip.NewReader(lr)
-			if err != nil {
-				log.Fatal(err)
-			}
-			n, err := io.Copy(io.Discard, gzr)
-			if err != nil {
-				log.Fatal(err)
-			}
-			log.Printf("scanned %d bytes in %v (from slice 100000)", n, time.Since(started))
-		}
-	}
-}
-
-// IndexLine contains CDX index fields.
-type IndexLine struct {
-	Surt   string
-	Date   string
-	Name   string
-	Offset int64
-	Length int64
-}
-
-func parseIndexLine(s string) (*IndexLine, error) {
-	parts := strings.Fields(strings.TrimSpace(s))
-	if len(parts) != 5 {
-		return nil, fmt.Errorf("invalid line: %s", s)
-	}
-	offset, err := strconv.Atoi(parts[3])
-	if err != nil {
-		return nil, fmt.Errorf("cannot parse offset: %v", err)
-	}
-	length, err := strconv.Atoi(parts[4])
-	if err != nil {
-		return nil, fmt.Errorf("cannot parse length: %v", err)
-	}
-	return &IndexLine{
-		Surt:   parts[0],
-		Date:   parts[1],
-		Name:   parts[2],
-		Offset: int64(offset),
-		Length: int64(length),
-	}, nil
-}
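For orientation, the example capture line quoted in the deleted header comment appears to follow the common 11-column CDX layout ("CDX N b a m s k r M S V g": SURT key, timestamp, original URL, MIME type, HTTP status, digest, redirect, meta flags, compressed size, offset, filename). A minimal sketch of naming those fields; the struct and its field names are illustrative, not part of the deleted tool:

```go
package main

import (
	"fmt"
	"log"
	"strings"
)

// CDXRecord names the 11 whitespace-separated columns of a classic CDX line.
// Field names follow common CDX conventions, not anything in the deleted tool.
type CDXRecord struct {
	SURT, Timestamp, Original, MIME, Status string
	Digest, Redirect, Meta                  string
	Size, Offset, Filename                  string
}

func parseCDX(line string) (*CDXRecord, error) {
	f := strings.Fields(line)
	if len(f) != 11 {
		return nil, fmt.Errorf("want 11 fields, got %d", len(f))
	}
	return &CDXRecord{f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7], f[8], f[9], f[10]}, nil
}

func main() {
	// The example line from the deleted header comment.
	rec, err := parseCDX("org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s at offset %s in %s\n", rec.SURT, rec.Offset, rec.Filename)
}
```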
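The core trick in the deleted main loop is resolving an idx entry to a byte range and decompressing only that slice of a shard. Here is a self-contained sketch of the same flow under two assumptions: a local file stands in for the HDFS client, and the idx line and shard path are hypothetical.

```go
package main

import (
	"bufio"
	"compress/gzip"
	"fmt"
	"io"
	"log"
	"os"
	"strconv"
	"strings"
)

// indexEntry mirrors the five whitespace-separated idx fields:
// SURT, timestamp, shard name, byte offset, slice length.
type indexEntry struct {
	Surt, Date, Name string
	Offset, Length   int64
}

func parseIdx(s string) (*indexEntry, error) {
	parts := strings.Fields(strings.TrimSpace(s))
	if len(parts) != 5 {
		return nil, fmt.Errorf("invalid idx line: %s", s)
	}
	offset, err := strconv.ParseInt(parts[3], 10, 64)
	if err != nil {
		return nil, fmt.Errorf("cannot parse offset: %v", err)
	}
	length, err := strconv.ParseInt(parts[4], 10, 64)
	if err != nil {
		return nil, fmt.Errorf("cannot parse length: %v", err)
	}
	return &indexEntry{parts[0], parts[1], parts[2], offset, length}, nil
}

func main() {
	// Hypothetical idx line; the offset must point at a gzip member boundary,
	// which is why the shards need to be concatenated gzips.
	entry, err := parseIdx("org,example)/ 20210211000000 part-a-00271.gz 43865977 3943")
	if err != nil {
		log.Fatal(err)
	}
	f, err := os.Open("part-a-00271.gz") // local stand-in for the HDFS open
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	if _, err := f.Seek(entry.Offset, io.SeekStart); err != nil {
		log.Fatal(err)
	}
	// Decompress only this slice; LimitReader stops at the slice boundary.
	gzr, err := gzip.NewReader(io.LimitReader(f, entry.Length))
	if err != nil {
		log.Fatal(err)
	}
	sc := bufio.NewScanner(gzr)
	for sc.Scan() {
		fmt.Println(sc.Text()) // plain CDX lines for this slice
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
}
```

The seek is only sound because each shard is, as the deleted comment suspects, a concatenation of independent gzip members: gzip.NewReader started at a member boundary decodes that member on its own, and io.LimitReader keeps the read from running past the slice.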