aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--skate/.gitignore1
-rw-r--r--skate/cmd/skate-cdx-lookup/main.go40
2 files changed, 39 insertions, 2 deletions
diff --git a/skate/.gitignore b/skate/.gitignore
index 4e893a0..698a39a 100644
--- a/skate/.gitignore
+++ b/skate/.gitignore
@@ -22,6 +22,7 @@
/skate-bref-id
/skate-from-unstructured
/skate-wikipedia-doi
+/skate-cdx-lookup
packaging/debian/skate/usr
skate_*_amd64.deb
diff --git a/skate/cmd/skate-cdx-lookup/main.go b/skate/cmd/skate-cdx-lookup/main.go
index 1989536..742ca7d 100644
--- a/skate/cmd/skate-cdx-lookup/main.go
+++ b/skate/cmd/skate-cdx-lookup/main.go
@@ -1,14 +1,37 @@
+// skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try
+// to read from HDFS in parallel and cache some mapping information locally
+// for fast access.
+//
+// What we want: Look up 10-100M URLs and report whether we have each or not.
+// Also make this a bit more generic, so we can look up all kinds of things in
+// the CDX index.
+//
+// Alternatives: Spark, Sparkling, PIG, Hive, ...
+//
+// We take advantage of index files and sorted data. The complete dataset is
+// 66TB, gzip compressed.
+//
+// An example line:
+//
+// org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz
+//
+// The index files are named like part-a-00276-idx and are typically around 100M, not compressed. A file has about 900K lines and takes 1-2s to scan.
+//
package main
import (
"flag"
"fmt"
"log"
+ "sort"
"github.com/colinmarc/hdfs"
)
-var nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
+var (
+ nameNode = flag.String("nn", "", "namenode, leave empty when env is set up")
+ cdxDir = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir")
+)
func main() {
flag.Usage = func() {
@@ -26,5 +49,18 @@ $ echo $HADOOP_CONF_DIR # should not be empty
if err != nil {
log.Fatal(err)
}
- log.Println(client)
+ fis, err := client.ReadDir(*cdxDir)
+ if err != nil {
+ log.Fatal(err)
+ }
+ var names []string
+ for _, fi := range fis {
+ names = append(names, fi.Name())
+ }
+ sort.Strings(names)
+ if len(names) == 0 {
+ log.Fatalf("missing files: %s", *cdxDir)
+ }
+ cdxTs := names[0]
+ log.Printf("using %s", cdxTs)
}