From 64503ce8fb755384623821bfabfa81bbb37d8f6e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 8 May 2018 16:58:09 +0000
Subject: pig cdx join improvements

---
Note (review): the only change in this patch is the path expression on the
CDX LOAD statement, which now reads '$INPUT_CDX' instead of '$INPUT'.
Presumably invocations of this script must now supply an INPUT_CDX Pig
parameter in place of INPUT -- confirm against the callers.

 pig/filter-cdx-join-urls.pig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
index b396c82..70858b9 100644
--- a/pig/filter-cdx-join-urls.pig
+++ b/pig/filter-cdx-join-urls.pig
@@ -18,7 +18,7 @@ surts = FOREACH urls GENERATE SURTURL(url) AS url_surt;
 surts = ORDER surts by url_surt ASC PARALLEL 10;
 surts = DISTINCT surts;
 
-cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = LOAD '$INPUT_CDX' AS cdxline:chararray;
 cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
 cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
 
-- 
cgit v1.2.3