From 6e8305e625f8b033d2697d40ed31ec15368678f9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 10 Dec 2018 13:33:41 +0800 Subject: update notes --- notes/crawl_cdx_merge.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'notes') diff --git a/notes/crawl_cdx_merge.md b/notes/crawl_cdx_merge.md index a843a8d..1d744f5 100644 --- a/notes/crawl_cdx_merge.md +++ b/notes/crawl_cdx_merge.md @@ -1,6 +1,19 @@ -## Old Way +## New Way + +Run script from scratch repo: + + ~/scratch/bin/cdx_collection.py CRAWL-2000 + + zcat CRAWL-2000.cdx.gz | wc -l + # update crawl README/ANALYSIS/whatever + +Assuming we're just looking at PDFs: + + zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -u | gzip > CRAWL-2000.sorted.cdx.gz + +## Old Way Use metamgr to export an items list. -- cgit v1.2.3