give sort way more RAM by default

author: Bryan Newbold <bnewbold@archive.org> 2019-02-01 15:13:32 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2019-02-01 15:13:32 -0800
commit: 52967e05d2c8febdaa0426634fa987eaf5f58577 (patch)
tree: da12fd9c2f1ea3d517246a60dbc1467eb0ad748f /notes
parent: 8901138485d1da4eb9a2512268faaa27fdf567c5 (diff)
download: sandcrawler-52967e05d2c8febdaa0426634fa987eaf5f58577.tar.gz sandcrawler-52967e05d2c8febdaa0426634fa987eaf5f58577.zip
3 files changed, 6 insertions, 6 deletions
diff --git a/notes/crawl_cdx_merge.md b/notes/crawl_cdx_merge.md
index 1d744f5..d2cffee 100644
--- a/notes/crawl_cdx_merge.md
+++ b/notes/crawl_cdx_merge.md
@@ -11,7 +11,7 @@ Run script from scratch repo:
 
 Assuming we're just looking at PDFs:
 
-    zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -u | gzip > CRAWL-2000.sorted.cdx.gz
+    zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -S 4G -u | gzip > CRAWL-2000.sorted.cdx.gz
 
 ## Old Way
 
@@ -22,7 +22,7 @@ Get all the CDX files and merge/sort:
     mkdir CRAWL-2000 && cd CRAWL-2000
     cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz
     ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx
-    sort -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
+    sort -S 4G -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
     wc -l CRAWL-2000.cdx
     rm CRAWL-2000.unsorted.cdx
 
diff --git a/notes/match_filter_enrich.txt b/notes/match_filter_enrich.txt
index 0c9a2c3..0c1f7df 100644
--- a/notes/match_filter_enrich.txt
+++ b/notes/match_filter_enrich.txt
@@ -9,7 +9,7 @@ somewhere.
 
 Reduce down the scored matches to just {sha1, dois}, sorted:
 
-    zcat 2018-08-27-2352.17-matchcrossref.tsv.gz | ./filter_scored_matches.py | pv -l | sort > 2018-08-27-2352.17-matchcrossref.filtered.tsv
+    zcat 2018-08-27-2352.17-matchcrossref.tsv.gz | ./filter_scored_matches.py | pv -l | sort -S 8G > 2018-08-27-2352.17-matchcrossref.filtered.tsv
     # 5.79M 0:18:54 [5.11k/s]
 
 Join/merge the output:
@@ -25,7 +25,7 @@ json2} columns from the regular match script. The filter_scored_matches.py
 doesn't know what to do with those columns at the moment, and the output isn't
 sorted by slug... need to tweak scripts to fix this.
 
-In the meanwhile, as a work around just take the columns we want and resort:
+In the meanwhile, as a work around just take the columns we want and re-sort:
 
     export LC_ALL=C
-    zcat 2018-12-18-2237.09-matchcrossref.insertable.tsv.gz | cut -f2-5 | sort -u | gzip > 2018-12-18-2237.09-matchcrossref.tsv.gz
+    zcat 2018-12-18-2237.09-matchcrossref.insertable.tsv.gz | cut -f2-5 | sort -S 8G -u | gzip > 2018-12-18-2237.09-matchcrossref.tsv.gz
diff --git a/notes/petabox_ia_metadata.txt b/notes/petabox_ia_metadata.txt
index df3bdc1..3a99805 100644
--- a/notes/petabox_ia_metadata.txt
+++ b/notes/petabox_ia_metadata.txt
@@ -14,7 +14,7 @@ Commands:
     ia-mine metamgr-journals-loose.20181218.items > journals.20181218.json
 
     export LC_ALL=C
-    cat journals-ia.20181218.json | jq 'select(.files) | .files[] | select(.format == "Text PDF") | .sha1' -r | sort -u > journals-ia.20181218.pdf-sha1.tsv
+    cat journals-ia.20181218.json | jq 'select(.files) | .files[] | select(.format == "Text PDF") | .sha1' -r | sort -S 4G -u > journals-ia.20181218.pdf-sha1.tsv
 
 Size/results:
author	Bryan Newbold <bnewbold@archive.org>	2019-02-01 15:13:32 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2019-02-01 15:13:32 -0800
commit	52967e05d2c8febdaa0426634fa987eaf5f58577 (patch)
tree	da12fd9c2f1ea3d517246a60dbc1467eb0ad748f /notes
parent	8901138485d1da4eb9a2512268faaa27fdf567c5 (diff)
download	sandcrawler-52967e05d2c8febdaa0426634fa987eaf5f58577.tar.gz sandcrawler-52967e05d2c8febdaa0426634fa987eaf5f58577.zip