diff options
Diffstat (limited to 'notes/crawl_cdx_merge.md')
-rw-r--r-- | notes/crawl_cdx_merge.md | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/notes/crawl_cdx_merge.md b/notes/crawl_cdx_merge.md new file mode 100644 index 0000000..d330e9b --- /dev/null +++ b/notes/crawl_cdx_merge.md @@ -0,0 +1,29 @@ + +## New Way + +Run script from scratch repo: + + ~/scratch/bin/cdx_collection.py CRAWL-2000 + + zcat CRAWL-2000.cdx.gz | wc -l + + # update crawl README/ANALYSIS/whatever + +Assuming we're just looking at PDFs: + + zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -S 4G -u > CRAWL-2000.sorted.cdx + +## Old Way + +Use metamgr to export an items list. + +Get all the CDX files and merge/sort: + + mkdir CRAWL-2000 && cd CRAWL-2000 + cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz + ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx + sort -S 4G -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx + wc -l CRAWL-2000.cdx + rm CRAWL-2000.unsorted.cdx + + # gzip and upload to petabox, or send to HDFS, or whatever |