From 52967e05d2c8febdaa0426634fa987eaf5f58577 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 1 Feb 2019 15:13:32 -0800
Subject: give sort way more RAM by default

---
 notes/crawl_cdx_merge.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'notes/crawl_cdx_merge.md')

diff --git a/notes/crawl_cdx_merge.md b/notes/crawl_cdx_merge.md
index 1d744f5..d2cffee 100644
--- a/notes/crawl_cdx_merge.md
+++ b/notes/crawl_cdx_merge.md
@@ -11,7 +11,7 @@ Run script from scratch repo:
 
 Assuming we're just looking at PDFs:
 
-    zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -u | gzip > CRAWL-2000.sorted.cdx.gz
+    zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -S 4G -u | gzip > CRAWL-2000.sorted.cdx.gz
 
 ## Old Way
 
@@ -22,7 +22,7 @@ Get all the CDX files and merge/sort:
     mkdir CRAWL-2000 && cd CRAWL-2000
     cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz
     ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx
-    sort -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
+    sort -S 4G -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
     wc -l CRAWL-2000.cdx
     rm CRAWL-2000.unsorted.cdx
 
-- 
cgit v1.2.3