diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/tasks.py | 6 |
1 files changed, 2 insertions, 4 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 21335f3..6ab8d06 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1258,9 +1258,7 @@ class Bref(Refcat):
 
 class BrefSortedByWorkID(Refcat):
     """
-    Sort by work id. 237m45.094s.
-
-    Final file currently has: 915168340 docs.
+    Sort by work id. Keep only docs that actually have a work id. 237m45.094s.
     """
     def requires(self):
         return Bref()
@@ -1268,7 +1266,7 @@ class BrefSortedByWorkID(Refcat):
     def run(self):
         output = shellout("""
                           zstdcat -T0 {bref} |
-                          skate-map -B -m ff -x source_work_ident |
+                          skate-map -skip-on-empty 1 -B -m ff -x source_work_ident |
                           LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
                           zstd -c -T0 > {output}
                           """, tmpdir=self.tmpdir,