aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-14 00:43:01 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-14 00:43:01 +0200
commit b7202cc09b67f5a1e58576b9c0feac7baae387c2 (patch)
tree c027fe59dfe02495a8a911d9d86bab7574e93a38 /python
parent 5eb527be3f34dbadae698f3ece164e34c031cb91 (diff)
download refcat-b7202cc09b67f5a1e58576b9c0feac7baae387c2.tar.gz
refcat-b7202cc09b67f5a1e58576b9c0feac7baae387c2.zip
tasks: only include docs with a work id
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 6
1 files changed, 2 insertions, 4 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 21335f3..6ab8d06 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1258,9 +1258,7 @@ class Bref(Refcat):
class BrefSortedByWorkID(Refcat):
"""
- Sort by work id. 237m45.094s.
-
- Final file currently has: 915168340 docs.
+ Sort by work id. Keep only docs that actually have a work id. 237m45.094s.
"""
def requires(self):
return Bref()
@@ -1268,7 +1266,7 @@ class BrefSortedByWorkID(Refcat):
def run(self):
output = shellout("""
zstdcat -T0 {bref} |
- skate-map -B -m ff -x source_work_ident |
+ skate-map -skip-on-empty 1 -B -m ff -x source_work_ident |
LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -c -T0 > {output}
""",
tmpdir=self.tmpdir,