From 024132dfb8617fbc255e0423319e8692a6867370 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 19 May 2021 01:22:54 +0200 Subject: tasks: update note --- python/refcat/tasks.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'python') diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index ec4d8a2..a5e5c72 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -299,6 +299,10 @@ class URLTabs(Refcat): class URLList(Refcat): """ List of cleaned, unique URLs from refs. + + For CDX lookup, we only want lines starting with http (^http), so: + + $ zstdcat -T0 date-2021-05-06.tsv.zst | grep '^http' > fatcat-refs-urllist-2021-05-06.tsv """ def requires(self): return URLTabs() -- cgit v1.2.3