aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/refcat/cli.py1
-rw-r--r--python/refcat/tasks.py34
2 files changed, 32 insertions, 3 deletions
diff --git a/python/refcat/cli.py b/python/refcat/cli.py
index 61b44b9..d4b5b81 100644
--- a/python/refcat/cli.py
+++ b/python/refcat/cli.py
@@ -146,6 +146,7 @@ def ll():
except TaskClassNotFoundException as exc:
print("no such task")
+
def deps():
"""
Render task dependencies.
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 09e88c4..a3bf458 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -197,9 +197,9 @@ TODO: Unmatched
Idea was that "source release ident + ref index" should allow completeness and
order. "crossref" and "grobid" order may vary.
-In any way, we want the raw ref blob sorted by release ident - it's already
-sorted by work ident. We do have a work ident for all brefs as well, so we need
-to sort the combined bref blob by work id.
+In any case, we may want the raw ref blob sorted by (source) release ident -
+it's already sorted by work ident. We do have a work ident for all brefs as
+well, so we need to sort the combined bref blob by work id.
bref blob raw ref blob
work_id work_id
@@ -460,6 +460,7 @@ class URLTabs(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
class URLTabsCleaned(Refcat):
"""
URLTabs, cleaned, unsorted.
@@ -482,6 +483,7 @@ class URLTabsCleaned(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
class URLList(Refcat):
"""
List of mostly cleaned, unique URLs from refs.
@@ -1219,6 +1221,32 @@ class Bref(Refcat):
#
+# Final Assembly
+# --------------
+#
+
+
+class BrefSortedByWorkID(Refcat):
+ """
+ Sort by work id.
+ """
+ def requires(self):
+ return Bref()
+
+ def run(self):
+ shellout("""
+ zstdcat -T0 {bref} |
+ skate-map -m ff -x source_work_ident |
+ sort -S 25 % -k1,1 > zstd -c -T0 > {output}
+ """,
+ bref=self.input().path)
+ luigi.LocalTarget(tmpf).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
+#
# Extra
# -----
#