From 41227ef89d3919ba160a9a4e42c7e70a39fa30ed Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 19 May 2021 00:55:21 +0200 Subject: pig: full join notes --- pig/README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pig/README.md b/pig/README.md index c186e8e..d13c065 100644 --- a/pig/README.md +++ b/pig/README.md @@ -26,7 +26,7 @@ A test run with a single file. $ pig -p INPUT_CDX=/user/wmdata2/cdx-all-index/20210422171221/part-a-00031.gz -p INPUT_URLS=/user/martin/fatcat-refs-urllist-2021-05-06.tsv -p OUTPUT=/user/martin/fatcat-refs-lookup-0 ``` -* ia802401.us.archive.org:6988/ +* http://ia802401.us.archive.org:6988/ * http://ia802401.us.archive.org:6988/cluster/app/application_1611217683160_298042 * http://ia802401.us.archive.org:6988/proxy/application_1611217683160_298042/ @@ -55,6 +55,14 @@ $ jq -rc .status refs_links_liveweb.json | sort | uniq -c | sort -nr 2> /dev/nul 10 null ``` +Running against a full index. + +``` +$ time pig -p INPUT_CDX=/user/wmdata2/cdx-all-index/20210321055100/part-a-*.gz -p INPUT_URLS=/user/martin/fatcat-refs-urllist-2021-05-06.tsv -p OUTPUT=/user/martin/fatcat-refs-lookup-1 filter-cdx-join-urls.pig +``` + +* http://ia802401.us.archive.org:6988/proxy/application_1611217683160_300026/ + ---- -- cgit v1.2.3