about | summary | refs | log | tree | commit | diff | stats
path: root/notes/crawl_cdx_merge.md
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-02-01 15:13:32 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-02-01 15:13:32 -0800
commit 52967e05d2c8febdaa0426634fa987eaf5f58577 (patch)
tree da12fd9c2f1ea3d517246a60dbc1467eb0ad748f /notes/crawl_cdx_merge.md
parent 8901138485d1da4eb9a2512268faaa27fdf567c5 (diff)
download sandcrawler-52967e05d2c8febdaa0426634fa987eaf5f58577.tar.gz
sandcrawler-52967e05d2c8febdaa0426634fa987eaf5f58577.zip
give sort way more RAM by default
Diffstat (limited to 'notes/crawl_cdx_merge.md')
-rw-r--r-- notes/crawl_cdx_merge.md 4
1 files changed, 2 insertions, 2 deletions
diff --git a/notes/crawl_cdx_merge.md b/notes/crawl_cdx_merge.md
index 1d744f5..d2cffee 100644
--- a/notes/crawl_cdx_merge.md
+++ b/notes/crawl_cdx_merge.md
@@ -11,7 +11,7 @@ Run script from scratch repo:
Assuming we're just looking at PDFs:
- zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -u | gzip > CRAWL-2000.sorted.cdx.gz
+ zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -S 4G -u | gzip > CRAWL-2000.sorted.cdx.gz
## Old Way
@@ -22,7 +22,7 @@ Get all the CDX files and merge/sort:
mkdir CRAWL-2000 && cd CRAWL-2000
cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz
ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx
- sort -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
+ sort -S 4G -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
wc -l CRAWL-2000.cdx
rm CRAWL-2000.unsorted.cdx