diff options
-rw-r--r-- | python/refcat/tasks.py | 4 | ||||
-rw-r--r-- | skate/cmd/skate-reduce/main.go | 2 |
2 files changed, 5 insertions, 1 deletion
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 362f61b..61be606 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -1251,6 +1251,8 @@ class RefsByWorkID(Refcat): """ Key raw refs by work id. Since data is already sorted by work id, this can skip the sorting step. 174m13.837s (~170K extractions/s). + + Seems, ordering is off, e.g. BrefSortedByWorkID starts with "22222dgdnzgxpmeq77nyyuj2x4". """ def requires(self): return Refs() @@ -1259,8 +1261,10 @@ class RefsByWorkID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x work_ident | + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 | zstd -c -T0 > {output} """, + tmpdir=self.tmpdir, input=self.input().path) luigi.LocalTarget(output).move(self.output().path) diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go index 2da7246..619a3fc 100644 --- a/skate/cmd/skate-reduce/main.go +++ b/skate/cmd/skate-reduce/main.go @@ -53,7 +53,7 @@ // | // * unmatched | join matched and unmatched reference data; do deduplicate on the fly // | -// | $ skate-reduce -m unmatched -M a.ndj -R b +// | $ skate-reduce -m unmatched -B a.ndj -R b.ndj // package main |