sort raw refs, too

author: Martin Czygan <martin.czygan@gmail.com> 2021-06-12 02:10:37 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-06-12 02:10:37 +0200
commit: fdff68e5aa4c228cd6b0962bee53be39a3c306ad (patch)
tree: 113df5e2df9d4848a8939aa89387eb67f96ad0e9
parent: e3e444ce9cd7831074b308bf3b24096446987848 (diff)
download: refcat-fdff68e5aa4c228cd6b0962bee53be39a3c306ad.tar.gz refcat-fdff68e5aa4c228cd6b0962bee53be39a3c306ad.zip
2 files changed, 5 insertions, 1 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 362f61b..61be606 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1251,6 +1251,8 @@ class RefsByWorkID(Refcat):
     """
     Key raw refs by work id. Since data is already sorted by work id, this can
     skip the sorting step. 174m13.837s (~170K extractions/s).
+
+    Seems, ordering is off, e.g. BrefSortedByWorkID starts with "22222dgdnzgxpmeq77nyyuj2x4".
     """
     def requires(self):
         return Refs()
@@ -1259,8 +1261,10 @@ class RefsByWorkID(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x work_ident |
+                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --parallel 4 |
                           zstd -c -T0 > {output}
                           """,
+                          tmpdir=self.tmpdir,
                           input=self.input().path)
         luigi.LocalTarget(output).move(self.output().path)
 
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
index 2da7246..619a3fc 100644
--- a/skate/cmd/skate-reduce/main.go
+++ b/skate/cmd/skate-reduce/main.go
@@ -53,7 +53,7 @@
 //             |
 // * unmatched | join matched and unmatched reference data; do deduplicate on the fly
 //             |
-//             | $ skate-reduce -m unmatched -M a.ndj -R b
+//             | $ skate-reduce -m unmatched -B a.ndj -R b.ndj
 //
 package main
author	Martin Czygan <martin.czygan@gmail.com>	2021-06-12 02:10:37 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-06-12 02:10:37 +0200
commit	fdff68e5aa4c228cd6b0962bee53be39a3c306ad (patch)
tree	113df5e2df9d4848a8939aa89387eb67f96ad0e9
parent	e3e444ce9cd7831074b308bf3b24096446987848 (diff)
download	refcat-fdff68e5aa4c228cd6b0962bee53be39a3c306ad.tar.gz refcat-fdff68e5aa4c228cd6b0962bee53be39a3c306ad.zip