about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2021-07-15 18:03:36 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-07-15 18:03:36 +0200
commit: 2269340c75402260ca92572371477c80cff4bdac (patch)
tree: 7858ff5d7c4f2e69524772eb66b7d5ce0b6ae666 /python
parent: 014409d04a36d2943bb7532ab0caee667bb628c6 (diff)
download: refcat-2269340c75402260ca92572371477c80cff4bdac.tar.gz
refcat-2269340c75402260ca92572371477c80cff4bdac.zip
tasks: tweak CDXURL
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 8
1 file changed, 5 insertions, 3 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 4a4a357..a7a3ad9 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1500,15 +1500,17 @@ class CDXURL(Refcat):
limit = luigi.IntParameter(default=10000, significant=False)
def requires(self):
- return URLList()
+ return RefsURL()
def run(self):
output = shellout("""
zstdcat -T0 {input} |
- head -n {limit} |
+ LC_ALL=C cut -f 1 |
+ LC_ALL=C head -n {limit} |
skate-cdx-lookup -q -s 50ms -c {cache} -j -B |
skate-map -m cdxu |
- LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -c -T0 > {output}
+ LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
+ zstd -c -T0 > {output}
""",
limit=self.limit,
input=self.input().path,