diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2021-06-12 02:10:37 +0200 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-06-12 02:10:37 +0200 | 
| commit | fdff68e5aa4c228cd6b0962bee53be39a3c306ad (patch) | |
| tree | 113df5e2df9d4848a8939aa89387eb67f96ad0e9 /python | |
| parent | e3e444ce9cd7831074b308bf3b24096446987848 (diff) | |
| download | refcat-fdff68e5aa4c228cd6b0962bee53be39a3c306ad.tar.gz refcat-fdff68e5aa4c228cd6b0962bee53be39a3c306ad.zip | |
sort raw refs, too
Diffstat (limited to 'python')
| -rw-r--r-- | python/refcat/tasks.py | 4 | 
1 file changed, 4 insertions, 0 deletions
```diff
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 362f61b..61be606 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1251,6 +1251,8 @@ class RefsByWorkID(Refcat):
     """
     Key raw refs by work id. Since data is already sorted by work id, this can
     skip the sorting step. 174m13.837s (~170K extractions/s).
+
+    Seems, ordering is off, e.g. BrefSortedByWorkID starts with "22222dgdnzgxpmeq77nyyuj2x4".
     """
     def requires(self):
         return Refs()
@@ -1259,8 +1261,10 @@ class RefsByWorkID(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x work_ident |
+                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 |
                           zstd -c -T0 > {output}
                           """,
+                          tmpdir=self.tmpdir,
                           input=self.input().path)
         luigi.LocalTarget(output).move(self.output().path)
```
