diff options
-rw-r--r-- | python/refcat/tasks.py | 4 |
1 files changed, 4 insertions, 0 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index ec4d8a2..a5e5c72 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -299,6 +299,10 @@ class URLTabs(Refcat):
 class URLList(Refcat):
     """
     List of cleaned, unique URLs from refs.
+
+    For CDX lookup, we just want ^http, so:
+
+    $ zstdcat -T0 date-2021-05-06.tsv.zst | grep ^http > fatcat-refs-urllist-2021-05-06.tsv
     """
     def requires(self):
         return URLTabs()