diff options
-rw-r--r-- | python/refcat/cli.py | 1 | ||||
-rw-r--r-- | python/refcat/tasks.py | 34 |
2 files changed, 32 insertions, 3 deletions
diff --git a/python/refcat/cli.py b/python/refcat/cli.py
index 61b44b9..d4b5b81 100644
--- a/python/refcat/cli.py
+++ b/python/refcat/cli.py
@@ -146,6 +146,7 @@ def ll():
     except TaskClassNotFoundException as exc:
         print("no such task")
 
+
 def deps():
     """
     Render task dependencies.
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 09e88c4..a3bf458 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -197,9 +197,9 @@ TODO: Unmatched
 Idea was that "source release ident + ref index" should allow completeness and
 order. "crossref" and "grobid" order may vary.
 
-In any way, we want the raw ref blob sorted by release ident - it's already
-sorted by work ident. We do have a work ident for all brefs as well, so we need
-to sort the combined bref blob by work id.
+In any way, we may want the raw ref blob sorted by (source) release ident -
+it's already sorted by work ident. We do have a work ident for all brefs as
+well, so we need to sort the combined bref blob by work id.
 
 bref blob raw ref blob
 work_id work_id
@@ -460,6 +460,7 @@ class URLTabs(Refcat):
     def output(self):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
 
+
 class URLTabsCleaned(Refcat):
     """
     URLTabs, cleaned, unsorted.
@@ -482,6 +483,7 @@ class URLTabsCleaned(Refcat):
     def output(self):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
 
+
 class URLList(Refcat):
     """
     List of mostly cleaned, unique URLs from refs.
@@ -1219,6 +1221,32 @@ class Bref(Refcat):
 
 
 #
+# Final Assembly
+# --------------
+#
+
+
+class BrefSortedByWorkID(Refcat):
+    """
+    Bref, sorted by the key that skate-map extracts from source_work_ident.
+    """
+    def requires(self):
+        return Bref()
+
+    def run(self):
+        output = shellout("""
+                          zstdcat -T0 {bref} |
+                          skate-map -m ff -x source_work_ident |
+                          sort -S 25% -k1,1 | zstd -c -T0 > {output}
+                          """,
+                          bref=self.input().path)
+        luigi.LocalTarget(output).move(self.output().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
+#
 # Extra
 # -----
 #