aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-05-19 01:22:54 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-05-19 01:22:54 +0200
commit 024132dfb8617fbc255e0423319e8692a6867370 (patch)
tree 529a97d4db600d10bc27abf14c0c22711d542d3f /python
parent 41227ef89d3919ba160a9a4e42c7e70a39fa30ed (diff)
download refcat-024132dfb8617fbc255e0423319e8692a6867370.tar.gz
refcat-024132dfb8617fbc255e0423319e8692a6867370.zip
tasks: update note
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index ec4d8a2..a5e5c72 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -299,6 +299,10 @@ class URLTabs(Refcat):
class URLList(Refcat):
"""
List of cleaned, unique URLs from refs.
+
+ For CDX lookup, we just want ^http, so:
+
+ $ zstdcat -T0 date-2021-05-06.tsv.zst | grep ^http > fatcat-refs-urllist-2021-05-06.tsv
"""
def requires(self):
return URLTabs()