From 1ae7fd2f0c5661560b15be86614c2c4d41b21205 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 24 Aug 2018 13:39:02 -0700
Subject: commit notes from my laptop

---
 notes/crawl_cdx_merge.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 notes/crawl_cdx_merge.md

(limited to 'notes/crawl_cdx_merge.md')

diff --git a/notes/crawl_cdx_merge.md b/notes/crawl_cdx_merge.md
new file mode 100644
index 0000000..a843a8d
--- /dev/null
+++ b/notes/crawl_cdx_merge.md
@@ -0,0 +1,16 @@
+
+## Old Way
+
+
+Use metamgr to export an items list.
+
+Get all the CDX files and merge/sort:
+
+    mkdir CRAWL-2000 && cd CRAWL-2000
+    cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz
+    ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx
+    sort -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
+    wc -l CRAWL-2000.cdx
+    rm CRAWL-2000.unsorted.cdx
+
+    # gzip and upload to petabox, or send to HDFS, or whatever
-- 
cgit v1.2.3