diff options
Diffstat (limited to 'notes/crawl_cdx_merge.md')
-rw-r--r-- | notes/crawl_cdx_merge.md | 16 |
1 files changed, 16 insertions, 0 deletions
diff --git a/notes/crawl_cdx_merge.md b/notes/crawl_cdx_merge.md
new file mode 100644
index 0000000..a843a8d
--- /dev/null
+++ b/notes/crawl_cdx_merge.md
@@ -0,0 +1,16 @@
+
+## Old Way
+
+
+Use metamgr to export an items list.
+
+Get all the CDX files and merge/sort:
+
+    mkdir CRAWL-2000 && cd CRAWL-2000
+    cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz
+    ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx
+    sort -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
+    wc -l CRAWL-2000.cdx
+    rm CRAWL-2000.unsorted.cdx
+
+    # gzip and upload to petabox, or send to HDFS, or whatever