-rw-r--r-- | skate/.gitignore                   |   1 |
-rw-r--r-- | skate/Makefile                     |   2 |
-rw-r--r-- | skate/cmd/skate-cdx-lookup/main.go | 170 |
3 files changed, 1 insertion(+), 172 deletions(-)
diff --git a/skate/.gitignore b/skate/.gitignore
index 698a39a..4e893a0 100644
--- a/skate/.gitignore
+++ b/skate/.gitignore
@@ -22,7 +22,6 @@
 /skate-bref-id
 /skate-from-unstructured
 /skate-wikipedia-doi
-/skate-cdx-lookup
 
 packaging/debian/skate/usr
 skate_*_amd64.deb
diff --git a/skate/Makefile b/skate/Makefile
index 386781d..ccaf08e 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
 SHELL := /bin/bash
-TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-cdx-lookup
+TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi
 PKGNAME := skate
 
 .PHONY: test
diff --git a/skate/cmd/skate-cdx-lookup/main.go b/skate/cmd/skate-cdx-lookup/main.go
deleted file mode 100644
index 00f27b5..0000000
--- a/skate/cmd/skate-cdx-lookup/main.go
+++ /dev/null
@@ -1,170 +0,0 @@
-// skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try
-// to read from HDFS in parallel and cache some mapping information locally for
-// fast access.
-//
-// What we want: Look up 10-100M URLs quickly and report whether the URL is in
-// GWB or not. Also make this a bit more generic, so we can look up other
-// things in the CDX index.
-//
-// As of 04/2021 the CDX is split into 300 files, each around 230G, for a total
-// of 70T (compressed, maybe 350T plain). Each file comes with a 90M index
-// containing about 1M lines (with surt, offset, ...).
-//
-// Test run and tiny design:
-//
-// * [ ] accept sorted input only
-// * [ ] get first URL, find the corresponding index file
-//
-// Raw index; only HTTP 200, or redirect; include everything; random URL from a
-// source; popular URL; hundreds of captures; filter the dump! SURT; huge
-// efficiency; PIG;
-// https://git.archive.org/webgroup/sandcrawler/-/tree/master/pig
-//
-// Alternatives: Spark, Sparkling, Pig, Hive, Java MR, ...
-//
-// We take advantage of index files and sorted data. The complete dataset is
-// 66TB, gzip compressed. We do not need compute to be distributed, as a single
-// machine may be enough to process the data.
-//
-// An example line:
-//
-// org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz
-//
-// The index files are named like part-a-00276-idx and are typically around 100M, not compressed; 900K lines, takes 1-2s to scan.
-//
-// The idx files are probably concatenated gzips, otherwise we could not seek into them.
-package main
-
-import (
-	"bufio"
-	"compress/gzip"
-	"flag"
-	"fmt"
-	"io"
-	"log"
-	"sort"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/colinmarc/hdfs"
-)
-
-var (
-	nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
-	cdxDir   = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir")
-
-	note = `
-Make sure the HADOOP env is set up:
-
-$ git clone https://git.archive.org/webgroup/hadoop-env.git
-$ source hadoop-env/prod/setup-env.sh
-$ echo $HADOOP_CONF_DIR # should not be empty
-`
-)
-
-func main() {
-	flag.Usage = func() {
-		fmt.Println(note)
-	}
-	flag.Parse()
-	client, err := hdfs.New(*nameNode)
-	if err != nil {
-		log.Fatal(err)
-	}
-	fis, err := client.ReadDir(*cdxDir)
-	if err != nil {
-		log.Fatal(err)
-	}
-	var names []string
-	for _, fi := range fis {
-		names = append(names, fi.Name())
-	}
-	sort.Strings(names)
-	if len(names) == 0 {
-		log.Fatalf("missing files: %s", *cdxDir)
-	}
-	cdxTs := names[0]
-	log.Printf("using %s", cdxTs)
-	// Example seek and read.
-	// /user/wmdata2/cdx-all-index/20210211202455/part-a-00271-idx, 845068 lines, uncompressed
-	// /user/wmdata2/cdx-all-index/20210211202455/part-a-00271.gz, maybe: concatenated gzip
-	f, err := client.Open("/user/wmdata2/cdx-all-index/" + cdxTs + "/part-a-00271-idx")
-	if err != nil {
-		log.Fatal(err)
-	}
-	defer f.Close()
-	var i int
-	br := bufio.NewReader(f)
-	for {
-		i++
-		line, err := br.ReadString('\n')
-		if err == io.EOF {
-			break
-		}
-		if err != nil {
-			log.Fatal(err)
-		}
-		indexLine, err := parseIndexLine(line)
-		if err != nil {
-			log.Fatal(err)
-		}
-		if i%25000 == 0 {
-			log.Printf("%d cdx index lines read", i)
-		}
-		if i == 100000 {
-			started := time.Now()
-			// example extraction
-			g, err := client.Open("/user/wmdata2/cdx-all-index/" + cdxTs + "/part-a-00271.gz")
-			if err != nil {
-				log.Fatal(err)
-			}
-			defer g.Close()
-			_, err = g.Seek(indexLine.Offset, io.SeekStart)
-			if err != nil {
-				log.Fatal(err)
-			}
-			lr := io.LimitReader(g, indexLine.Length)
-			gzr, err := gzip.NewReader(lr)
-			if err != nil {
-				log.Fatal(err)
-			}
-			n, err := io.Copy(io.Discard, gzr)
-			if err != nil {
-				log.Fatal(err)
-			}
-			log.Printf("scanned %d bytes in %v (from slice 100000)", n, time.Since(started))
-		}
-	}
-}
-
-// IndexLine contains CDX index fields.
-type IndexLine struct {
-	Surt   string
-	Date   string
-	Name   string
-	Offset int64
-	Length int64
-}
-
-func parseIndexLine(s string) (*IndexLine, error) {
-	parts := strings.Fields(strings.TrimSpace(s))
-	if len(parts) != 5 {
-		return nil, fmt.Errorf("invalid line: %s", s)
-	}
-	offset, err := strconv.Atoi(parts[3])
-	if err != nil {
-		return nil, fmt.Errorf("cannot parse offset %q: %v", parts[3], err)
-	}
-	length, err := strconv.Atoi(parts[4])
-	if err != nil {
-		return nil, fmt.Errorf("cannot parse length %q: %v", parts[4], err)
-	}
-	return &IndexLine{
-		Surt:   parts[0],
-		Date:   parts[1],
-		Name:   parts[2],
-		Offset: int64(offset),
-		Length: int64(length),
-	}, nil
-}
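The removed header sketches a planned flow of accepting sorted input and locating the right index entry first. The sketch below shows one way that lookup step could work, assuming the part-a-*-idx lines have already been parsed and sorted by SURT; the trimmed IndexLine shape mirrors the removed struct, while findBlock and the sample entries are hypothetical and not part of the deleted tool.

package main

import (
	"fmt"
	"sort"
)

// IndexLine mirrors the struct from the removed main.go (Date and Name
// omitted here for brevity).
type IndexLine struct {
	Surt   string
	Offset int64
	Length int64
}

// findBlock returns the entry whose gzip member may contain surt, assuming
// entries is sorted ascending by Surt: the candidate is the last entry with
// Surt <= surt. ok is false when surt sorts before the first entry.
func findBlock(entries []IndexLine, surt string) (IndexLine, bool) {
	i := sort.Search(len(entries), func(i int) bool { return entries[i].Surt > surt })
	if i == 0 {
		return IndexLine{}, false
	}
	return entries[i-1], true
}

func main() {
	// Hypothetical, already-sorted idx entries.
	entries := []IndexLine{
		{Surt: "org,example)/a", Offset: 0, Length: 4096},
		{Surt: "org,example)/m", Offset: 4096, Length: 4096},
		{Surt: "org,example)/t", Offset: 8192, Length: 4096},
	}
	if e, ok := findBlock(entries, "org,example)/p"); ok {
		fmt.Printf("candidate block: offset=%d length=%d\n", e.Offset, e.Length)
	}
}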
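The header also notes that the data files are probably concatenated gzips, which is what makes the (offset, length) pairs from the idx usable: seek to the offset, read length compressed bytes, and decompress just that member. A minimal local-file sketch of that step, without the HDFS client; the path, offset, and length values are made up for illustration.

package main

import (
	"compress/gzip"
	"fmt"
	"io"
	"log"
	"os"
)

// readMember decompresses the single gzip member that starts at offset and
// spans length compressed bytes, returning the plain CDX lines it contains.
func readMember(path string, offset, length int64) ([]byte, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	if _, err := f.Seek(offset, io.SeekStart); err != nil {
		return nil, err
	}
	gzr, err := gzip.NewReader(io.LimitReader(f, length))
	if err != nil {
		return nil, err
	}
	defer gzr.Close()
	return io.ReadAll(gzr)
}

func main() {
	// Hypothetical local shard and an (offset, length) pair taken from the
	// matching -idx file.
	b, err := readMember("part-a-00271.gz", 1048576, 65536)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("decompressed %d bytes\n", len(b))
}

The LimitReader is what keeps the read bounded to one member: without it, the gzip reader would keep decompressing the following concatenated members until the end of the shard.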