about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-02 22:06:55 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-03 00:02:55 +0200
commit e69e181a27ae868a5c9fdc2e63c6356b9319b7b2 (patch)
tree e83fb36e8ebeffe1524826789d9b2c464bd110fc
parent 6a11c515df6b9d7c707d96dac2b52098153e39f4 (diff)
download refcat-e69e181a27ae868a5c9fdc2e63c6356b9319b7b2.tar.gz
refcat-e69e181a27ae868a5c9fdc2e63c6356b9319b7b2.zip
drop explicit `--parallel 4` from sort calls: GNU sort's --parallel is set to min(num cpus, 8) by default
> Set the number of sorts run in parallel to n. By default, n is set to the number of available processors, but limited to 8, as there are diminishing performance gains after that. Note also that using n threads increases the memory usage by a factor of log n. http://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
-rw-r--r-- python/refcat/tasks.py 42
1 file changed, 21 insertions, 21 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 964555a..48c4226 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -457,7 +457,7 @@ class URLTabs(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ru -skip-on-empty 3 |
- LC_ALL=C sort -T {tmpdir} -k3,3 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k3,3 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -480,7 +480,7 @@ class URLTabsCleaned(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-cleanup -c url -allow http,https -X -B -S -f 3 |
- LC_ALL=C sort -T {tmpdir} -k3,3 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k3,3 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -504,7 +504,7 @@ class URLList(Refcat):
zstdcat -T0 {input} |
cut -f 3 |
skate-cleanup -X -c url -B -S -f 1 |
- LC_ALL=C sort -u -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -u -T {tmpdir} -k1,1 -S25% |
LC_ALL=C grep -E '^https?://' |
zstd -T0 -c > {output}
""",
@@ -535,7 +535,7 @@ class RefsDOI(Refcat):
zstdcat -T0 {input} |
skate-map -m ff -x biblio.doi -skip-on-empty 1 |
skate-cleanup -S -c doi -f 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -559,7 +559,7 @@ class RefsPMID(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ff -x biblio.pmid -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -583,7 +583,7 @@ class RefsPMCID(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ff -x biblio.pmcid -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -606,7 +606,7 @@ class RefsArxiv(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ff -x biblio.arxiv_id -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -636,7 +636,7 @@ class FatcatDOI(Refcat):
zstdcat -T0 {input} |
skate-map -m ff -x ext_ids.doi -skip-on-empty 1 |
skate-cleanup -S -c doi -f 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -659,7 +659,7 @@ class FatcatPMID(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ff -x ext_ids.pmid -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -682,7 +682,7 @@ class FatcatPMCID(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ff -x ext_ids.pmcid -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -705,7 +705,7 @@ class FatcatArxiv(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ff -x extra.arxiv.base_id -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -736,7 +736,7 @@ class FatcatMapped(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m {mapper} -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
mapper=self.mapper,
@@ -782,7 +782,7 @@ class RefsMapped(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m {mapper} -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -943,7 +943,7 @@ class OpenLibraryEditionsByWork(Refcat):
zstdcat -T0 {input} |
cut -f 5 |
skate-map -skip-on-empty 1 -m ff -x 'works.0.key' |
- LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
zstd -T0 -c > {output}
""",
tmpdir=self.tmpdir,
@@ -965,7 +965,7 @@ class OpenLibraryWorksSorted(Refcat):
output = shellout("""
zstdcat -T0 {input} |
cut -f 2,5 |
- LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
zstd -T0 -c > {output}
""",
tmpdir=self.tmpdir,
@@ -1047,7 +1047,7 @@ class OpenLibraryEditionsMapped(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m {mapper} -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
n=self.n,
@@ -1102,7 +1102,7 @@ class UnmatchedMapped(Refcat):
zstdcat -T0 {input} |
skate-conv -f ref |
skate-map -m rcns -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
zstd -T0 -c > {output}
""",
tmpdir=self.tmpdir,
@@ -1185,7 +1185,7 @@ class OpenLibraryReleaseMapped(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m {mapper} -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
mapper=self.mapper,
@@ -1247,7 +1247,7 @@ class BrefSortedByWorkID(Refcat):
output = shellout("""
zstdcat -T0 {bref} |
skate-map -B -m ff -x source_work_ident |
- LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 | zstd -c -T0 > {output}
+ LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -c -T0 > {output}
""",
tmpdir=self.tmpdir,
bref=self.input().path)
@@ -1271,7 +1271,7 @@ class RefsByWorkID(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ff -x work_ident |
- LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
zstd -c -T0 > {output}
""",
tmpdir=self.tmpdir,
@@ -1376,7 +1376,7 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m vcns -skip-on-empty 1 |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --parallel 4 |
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
zstd -T0 -c > {output}
""",
tmpdir=self.tmpdir,