 skate/.gitignore                   |   1 -
 skate/Makefile                     |   2 +-
 skate/cmd/skate-cdx-lookup/main.go | 170 -------------------------------------
 3 files changed, 1 insertion(+), 172 deletions(-)
diff --git a/skate/.gitignore b/skate/.gitignore
index 698a39a..4e893a0 100644
--- a/skate/.gitignore
+++ b/skate/.gitignore
@@ -22,7 +22,6 @@
/skate-bref-id
/skate-from-unstructured
/skate-wikipedia-doi
-/skate-cdx-lookup
packaging/debian/skate/usr
skate_*_amd64.deb
diff --git a/skate/Makefile b/skate/Makefile
index 386781d..ccaf08e 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-cdx-lookup
+TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi
PKGNAME := skate
.PHONY: test
diff --git a/skate/cmd/skate-cdx-lookup/main.go b/skate/cmd/skate-cdx-lookup/main.go
deleted file mode 100644
index 00f27b5..0000000
--- a/skate/cmd/skate-cdx-lookup/main.go
+++ /dev/null
@@ -1,170 +0,0 @@
-// skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try
-// to read from HDFS in parallel and cache some mapping information locally for
-// fast access.
-//
-// What we want: Look up 10-100M URLs quickly and report whether the URL is in
-// GWB or not. Also make this a bit more generic, so we can look up other
-// things in the CDX index.
-//
-// As of 04/2021 the CDX is split into 300 files, each around 230G, for a total
-// of 70T (compressed, maybe 350T plain). Each file comes with a 90M index
-// containing about 1M lines (with surt, offset, ...).
-//
-// Test run and tiny design:
-//
-// * [ ] accept sorted input only
-// * [ ] get first URL, find the corresponding index file
-//
-// Raw index; only HTTP 200, or redirect; include everything; random URL from a
-// source; popular URL; hundreds of captures; filter the dump! SURT; huge
-// efficiency; PIG;
-// https://git.archive.org/webgroup/sandcrawler/-/tree/master/pig
-//
-// Alternatives: Spark, Sparkling, Pig, Hive, Java MR, ...
-//
-// We take advantage of index files and sorted data. The complete dataset is
-// 66TB, gzip compressed. We do not need compute to be distributed, as a single
-// machine may be enough to process the data.
-//
-// An example line:
-//
-// org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz
-//
-// The index files are named part-a-00276-idx and are typically around 100M, not compressed. 900K lines, takes 1-2s to scan.
-//
-// The cdx data files (.gz) are probably concatenated gzips, otherwise we could not seek into them.
-package main
-
-import (
- "bufio"
- "compress/gzip"
- "flag"
- "fmt"
- "io"
- "log"
- "sort"
- "strconv"
- "strings"
- "time"
-
- "github.com/colinmarc/hdfs"
-)
-
-var (
- nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
- cdxDir = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir")
-
- note = `
-Make sure HADOOP env is set up.
-
-$ git clone https://git.archive.org/webgroup/hadoop-env.git
-$ source hadoop-env/prod/setup-env.sh
-$ echo $HADOOP_CONF_DIR # should not be empty
-`
-)
-
-func main() {
- flag.Usage = func() {
- fmt.Println(note)
- }
- flag.Parse()
- client, err := hdfs.New(*nameNode)
- if err != nil {
- log.Fatal(err)
- }
- fis, err := client.ReadDir(*cdxDir)
- if err != nil {
- log.Fatal(err)
- }
- var names []string
- for _, fi := range fis {
- names = append(names, fi.Name())
- }
- sort.Strings(names)
- if len(names) == 0 {
- log.Fatalf("missing files: %s", *cdxDir)
- }
- cdxTs := names[0]
- log.Printf("using %s", cdxTs)
- // Example seek and read.
- // /user/wmdata2/cdx-all-index/20210211202455/part-a-00271-idx, 845068 lines, uncompressed
- // /user/wmdata2/cdx-all-index/20210211202455/part-a-00271.gz, maybe: concatenated gzip
- f, err := client.Open("/user/wmdata2/cdx-all-index/" + cdxTs + "/part-a-00271-idx")
- if err != nil {
- log.Fatal(err)
- }
- defer f.Close()
- var i int
- br := bufio.NewReader(f)
- for {
- i++
- line, err := br.ReadString('\n')
- if err == io.EOF {
- break
- }
- if err != nil {
- log.Fatal(err)
- }
- indexLine, err := parseIndexLine(line)
- if err != nil {
- log.Fatal(err)
- }
- if i%25000 == 0 {
- log.Printf("%d cdx index lines read", i)
- }
- if i == 100000 {
- started := time.Now()
- // example extraction
- g, err := client.Open("/user/wmdata2/cdx-all-index/" + cdxTs + "/part-a-00271.gz")
- if err != nil {
- log.Fatal(err)
- }
- defer g.Close()
- _, err = g.Seek(indexLine.Offset, io.SeekStart)
- if err != nil {
- log.Fatal(err)
- }
- lr := io.LimitReader(g, indexLine.Length)
- gzr, err := gzip.NewReader(lr)
- if err != nil {
- log.Fatal(err)
- }
- n, err := io.Copy(io.Discard, gzr)
- if err != nil {
- log.Fatal(err)
- }
- log.Printf("scanned %d bytes in %v (from slice 100000)", n, time.Since(started))
- }
- }
-}
-
-// IndexLine contains CDX index fields.
-type IndexLine struct {
- Surt string
- Date string
- Name string
- Offset int64
- Length int64
-}
-
-func parseIndexLine(s string) (*IndexLine, error) {
- parts := strings.Fields(strings.TrimSpace(s))
- if len(parts) != 5 {
- return nil, fmt.Errorf("invalid line: %s", s)
- }
- offset, err := strconv.Atoi(parts[3])
- if err != nil {
- return nil, fmt.Errorf("cannot parse offset: %v", err)
- }
- length, err := strconv.Atoi(parts[4])
- if err != nil {
- return nil, fmt.Errorf("cannot parse length: %v", err)
- }
- return &IndexLine{
- Surt: parts[0],
- Date: parts[1],
- Name: parts[2],
- Offset: int64(offset),
- Length: int64(length),
- }, nil
-}
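
For reference, the core technique the removed tool sketches -- use the small, uncompressed part-*-idx files to find an (offset, length) pair, then seek into the corresponding concatenated-gzip CDX shard and decompress just that member -- also works against local copies of the files. The following is a minimal, hypothetical sketch along those lines; the idx line, file names and paths are made up for illustration, and the original tool read the same data from HDFS via github.com/colinmarc/hdfs.

package main

import (
	"compress/gzip"
	"fmt"
	"io"
	"log"
	"os"
	"strconv"
	"strings"
)

func main() {
	// A made-up idx entry in the five-field layout the removed tool parsed:
	// surt, date, name, offset, length.
	idxLine := "org,example)/ 20210211202455 part-a-00271.gz 43865977 3943"
	parts := strings.Fields(idxLine)
	if len(parts) != 5 {
		log.Fatalf("invalid idx line: %q", idxLine)
	}
	offset, err := strconv.ParseInt(parts[3], 10, 64)
	if err != nil {
		log.Fatal(err)
	}
	length, err := strconv.ParseInt(parts[4], 10, 64)
	if err != nil {
		log.Fatal(err)
	}
	// Seek to the gzip member the idx entry points to and decompress only
	// that slice; this relies on the shard being a concatenation of gzip
	// members, so decompression can start at a member boundary.
	f, err := os.Open("part-a-00271.gz") // hypothetical local copy of one shard
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	if _, err := f.Seek(offset, io.SeekStart); err != nil {
		log.Fatal(err)
	}
	gzr, err := gzip.NewReader(io.LimitReader(f, length))
	if err != nil {
		log.Fatal(err)
	}
	n, err := io.Copy(io.Discard, gzr)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("decompressed %d bytes from member at offset %d\n", n, offset)
}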