diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-02 22:06:55 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-03 00:02:55 +0200 |
commit | e69e181a27ae868a5c9fdc2e63c6356b9319b7b2 (patch) | |
tree | e83fb36e8ebeffe1524826789d9b2c464bd110fc | |
parent | 6a11c515df6b9d7c707d96dac2b52098153e39f4 (diff) | |
download | refcat-e69e181a27ae868a5c9fdc2e63c6356b9319b7b2.tar.gz refcat-e69e181a27ae868a5c9fdc2e63c6356b9319b7b2.zip |
GNU sort's --parallel is set to min(num cpus, 8) by default, so drop the explicit --parallel 4
> Set the number of sorts run in parallel to n. By default, n is set to
the number of available processors, but limited to 8, as there are
diminishing performance gains after that. Note also that using n threads
increases the memory usage by a factor of log n.
http://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
-rw-r--r-- | python/refcat/tasks.py | 42 |
1 file changed, 21 insertions, 21 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 964555a..48c4226 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -457,7 +457,7 @@ class URLTabs(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ru -skip-on-empty 3 | - LC_ALL=C sort -T {tmpdir} -k3,3 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k3,3 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -480,7 +480,7 @@ class URLTabsCleaned(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-cleanup -c url -allow http,https -X -B -S -f 3 | - LC_ALL=C sort -T {tmpdir} -k3,3 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k3,3 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -504,7 +504,7 @@ class URLList(Refcat): zstdcat -T0 {input} | cut -f 3 | skate-cleanup -X -c url -B -S -f 1 | - LC_ALL=C sort -u -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -u -T {tmpdir} -k1,1 -S25% | LC_ALL=C grep -E '^https?://' | zstd -T0 -c > {output} """, @@ -535,7 +535,7 @@ class RefsDOI(Refcat): zstdcat -T0 {input} | skate-map -m ff -x biblio.doi -skip-on-empty 1 | skate-cleanup -S -c doi -f 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -559,7 +559,7 @@ class RefsPMID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x biblio.pmid -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -583,7 +583,7 @@ class RefsPMCID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x biblio.pmcid -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -606,7 +606,7 @@ class RefsArxiv(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x biblio.arxiv_id -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + 
LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -636,7 +636,7 @@ class FatcatDOI(Refcat): zstdcat -T0 {input} | skate-map -m ff -x ext_ids.doi -skip-on-empty 1 | skate-cleanup -S -c doi -f 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -659,7 +659,7 @@ class FatcatPMID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x ext_ids.pmid -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -682,7 +682,7 @@ class FatcatPMCID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x ext_ids.pmcid -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -705,7 +705,7 @@ class FatcatArxiv(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x extra.arxiv.base_id -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -736,7 +736,7 @@ class FatcatMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m {mapper} -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, mapper=self.mapper, @@ -782,7 +782,7 @@ class RefsMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m {mapper} -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -943,7 +943,7 @@ class OpenLibraryEditionsByWork(Refcat): zstdcat -T0 {input} | cut -f 5 | skate-map -skip-on-empty 1 -m ff -x 'works.0.key' | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 | + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -T0 -c > 
{output} """, tmpdir=self.tmpdir, @@ -965,7 +965,7 @@ class OpenLibraryWorksSorted(Refcat): output = shellout(""" zstdcat -T0 {input} | cut -f 2,5 | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 | + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -T0 -c > {output} """, tmpdir=self.tmpdir, @@ -1047,7 +1047,7 @@ class OpenLibraryEditionsMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m {mapper} -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -1102,7 +1102,7 @@ class UnmatchedMapped(Refcat): zstdcat -T0 {input} | skate-conv -f ref | skate-map -m rcns -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 | + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -T0 -c > {output} """, tmpdir=self.tmpdir, @@ -1185,7 +1185,7 @@ class OpenLibraryReleaseMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m {mapper} -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, mapper=self.mapper, @@ -1247,7 +1247,7 @@ class BrefSortedByWorkID(Refcat): output = shellout(""" zstdcat -T0 {bref} | skate-map -B -m ff -x source_work_ident | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 | zstd -c -T0 > {output} + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -c -T0 > {output} """, tmpdir=self.tmpdir, bref=self.input().path) @@ -1271,7 +1271,7 @@ class RefsByWorkID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x work_ident | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 | + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -c -T0 > {output} """, tmpdir=self.tmpdir, @@ -1376,7 +1376,7 @@ class UnmatchedResolveJournalNamesMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m vcns -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | 
zstd -T0 -c > {output} """, tmpdir=self.tmpdir, |