From 7463049b621f7729b48c5e06429767118c1b8506 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 1 Feb 2019 15:10:34 -0800 Subject: update dump and sort commands Pipeline sorts are *so* starved and slow; they only get a few MByte of RAM by default! --- extra/partition_dumps/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'extra/partition_dumps/README.md') diff --git a/extra/partition_dumps/README.md b/extra/partition_dumps/README.md index 5e42ff48..463bf42d 100644 --- a/extra/partition_dumps/README.md +++ b/extra/partition_dumps/README.md @@ -6,8 +6,8 @@ journal/container. Example parititoning a sample by release type: cat release_export_expanded_sample.json | jq .release_type -r > release_export_expanded_sample.release_type - cat release_export_expanded_sample.release_type | sort | uniq -c | sort -nr > release_export_expanded_sample.release_type.counts - cat release_export_expanded_sample.json | paste release_export_expanded_sample.release_type - | sort > out + cat release_export_expanded_sample.release_type | sort -S 4G | uniq -c | sort -S 500M -nr > release_export_expanded_sample.release_type.counts + cat release_export_expanded_sample.json | paste release_export_expanded_sample.release_type - | sort -S 4G > out More production-y example using ISSN-L: @@ -16,10 +16,10 @@ More production-y example using ISSN-L: # it's a pretty huge sort, will need 300+ GB scratch space? this might not scale. 
zcat release_export_expanded.json.gz | jq .container.issnl -r > release_export_expanded.issnl - zcat release_export_expanded.json.gz | paste release_export_expanded.issnl - | sort | ./partition_script.py + zcat release_export_expanded.json.gz | paste release_export_expanded.issnl - | sort -S 8G | ./partition_script.py # for verification/stats - cat release_export_expanded.issnl | sort | uniq -c | sort -nr > release_export_expanded.issnl.counts + cat release_export_expanded.issnl | sort -S 1G | uniq -c | sort -S 1G -nr > release_export_expanded.issnl.counts # cleanup rm release_export_expanded.issnl -- cgit v1.2.3