about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2021-06-12 02:10:37 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-06-12 02:10:37 +0200
commit: fdff68e5aa4c228cd6b0962bee53be39a3c306ad (patch)
tree: 113df5e2df9d4848a8939aa89387eb67f96ad0e9 /python
parent: e3e444ce9cd7831074b308bf3b24096446987848 (diff)
download: refcat-fdff68e5aa4c228cd6b0962bee53be39a3c306ad.tar.gz
download: refcat-fdff68e5aa4c228cd6b0962bee53be39a3c306ad.zip
sort raw refs, too
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 362f61b..61be606 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1251,6 +1251,8 @@ class RefsByWorkID(Refcat):
"""
Key raw refs by work id. Since data is already sorted by work id, this can
skip the sorting step. 174m13.837s (~170K extractions/s).
+
+ Seems, ordering is off, e.g. BrefSortedByWorkID starts with "22222dgdnzgxpmeq77nyyuj2x4".
"""
def requires(self):
return Refs()
@@ -1259,8 +1261,10 @@ class RefsByWorkID(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-map -m ff -x work_ident |
+ LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 |
zstd -c -T0 > {output}
""",
+ tmpdir=self.tmpdir,
input=self.input().path)
luigi.LocalTarget(output).move(self.output().path)