aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-04-08 15:56:23 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-04-19 20:29:17 +0200
commitbf5ffca07b4dfbd9b1134e2a0223c0c7d27b49d9 (patch)
treedc0304f6ab48d75d722cc1b255fc60134caf3a82 /skate/cmd
parent03a0c9d734c36117aa3b6e7ed52d88c143795904 (diff)
downloadrefcat-bf5ffca07b4dfbd9b1134e2a0223c0c7d27b49d9.tar.gz
refcat-bf5ffca07b4dfbd9b1134e2a0223c0c7d27b49d9.zip
cdx lookup stub
Diffstat (limited to 'skate/cmd')
-rw-r--r--skate/cmd/skate-cdx-lookup/main.go40
1 files changed, 38 insertions, 2 deletions
diff --git a/skate/cmd/skate-cdx-lookup/main.go b/skate/cmd/skate-cdx-lookup/main.go
index 1989536..742ca7d 100644
--- a/skate/cmd/skate-cdx-lookup/main.go
+++ b/skate/cmd/skate-cdx-lookup/main.go
@@ -1,14 +1,37 @@
+// skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try
+// to read from HDFS in parallel and cache some mapping information locally
+// for fast access.
+//
+// What we want: Look up 10-100M URLs and report whether we have each one or not.
+// Also make this a bit more generic, so we can look up all kinds of things in
+// the CDX index.
+//
+// Alternatives: Spark, Sparkling, PIG, Hive, ...
+//
+// We take advantage of index files and sorted data. The complete dataset is
+// 66TB, gzip compressed.
+//
+// An example line:
+//
+// org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz
+//
+// The index files are named like part-a-00276-idx and are typically around 100M, uncompressed; each has about 900K lines and takes 1-2s to scan.
+//
package main
import (
"flag"
"fmt"
"log"
+ "sort"
"github.com/colinmarc/hdfs"
)
-var nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
+var (
+ nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
+ cdxDir = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir")
+)
func main() {
flag.Usage = func() {
@@ -26,5 +49,18 @@ $ echo $HADOOP_CONF_DIR # should not be empty
if err != nil {
log.Fatal(err)
}
- log.Println(client)
+ fis, err := client.ReadDir(*cdxDir)
+ if err != nil {
+ log.Fatal(err)
+ }
+ var names []string
+ for _, fi := range fis {
+ names = append(names, fi.Name())
+ }
+ sort.Strings(names)
+ if len(names) == 0 {
+ log.Fatalf("missing files: %s", *cdxDir)
+ }
+ cdxTs := names[0]
+ log.Printf("using %s", cdxTs)
}