From bf5ffca07b4dfbd9b1134e2a0223c0c7d27b49d9 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 8 Apr 2021 15:56:23 +0200 Subject: cdx lookup stub --- skate/.gitignore | 1 + skate/cmd/skate-cdx-lookup/main.go | 40 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/skate/.gitignore b/skate/.gitignore index 4e893a0..698a39a 100644 --- a/skate/.gitignore +++ b/skate/.gitignore @@ -22,6 +22,7 @@ /skate-bref-id /skate-from-unstructured /skate-wikipedia-doi +/skate-cdx-lookup packaging/debian/skate/usr skate_*_amd64.deb diff --git a/skate/cmd/skate-cdx-lookup/main.go b/skate/cmd/skate-cdx-lookup/main.go index 1989536..742ca7d 100644 --- a/skate/cmd/skate-cdx-lookup/main.go +++ b/skate/cmd/skate-cdx-lookup/main.go @@ -1,14 +1,37 @@ +// skate-cdx-lookup is a lookup tool for small and large lists of URLs. We try +// to read from HDFS in parallel and cache some mapping information locally +// for fast access. +// +// What we want: Look up 10-100M URLs and report whether we have them or not. +// Also make this a bit more generic, so we can look up all kinds of things in +// the CDX index. +// +// Alternatives: Spark, Sparkling, PIG, Hive, ... +// +// We take advantage of index files and sorted data. The complete dataset is +// 66TB, gzip compressed. +// +// An example line: +// +// org,rdnn,software,gps)/he.jpg 20050412144213 http://www.gps.software.rdnn.org:80/He.JPG image/jpeg 200 VSJNO26E43GP7OYL6BIRE4IXSIOMHZA5 - - 3943 43865977 ED_crawl28.20050412080854-c/ED_crawl28.20050412144103.arc.gz +// +// The index files are named part-a-00276-idx and are typically around 100M, not compressed. 900K lines, takes 1-2s to scan. 
+// package main import ( "flag" "fmt" "log" + "sort" "github.com/colinmarc/hdfs" ) -var nameNode = flag.String("nn", "", "namenode, leave empty when env is set up") +var ( + nameNode = flag.String("nn", "", "namenode, leave empty when env is set up") + cdxDir = flag.String("C", "/user/wmdata2/cdx-all-index", "cdx dir") +) func main() { flag.Usage = func() { @@ -26,5 +49,18 @@ $ echo $HADOOP_CONF_DIR # should not be empty if err != nil { log.Fatal(err) } - log.Println(client) + fis, err := client.ReadDir(*cdxDir) + if err != nil { + log.Fatal(err) + } + var names []string + for _, fi := range fis { + names = append(names, fi.Name()) + } + sort.Strings(names) + if len(names) == 0 { + log.Fatalf("missing files: %s", *cdxDir) + } + cdxTs := names[0] + log.Printf("using %s", cdxTs) } -- cgit v1.2.3