about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-05-08 16:58:09 +0000
committer Bryan Newbold <bnewbold@archive.org> 2018-05-08 16:58:09 +0000
commit 64503ce8fb755384623821bfabfa81bbb37d8f6e (patch)
tree dee6da085492795f48f53b5d43eaa162d5c125fd
parent 681b085bc2a090b8db366c54780f1ec81d811403 (diff)
download sandcrawler-64503ce8fb755384623821bfabfa81bbb37d8f6e.tar.gz
sandcrawler-64503ce8fb755384623821bfabfa81bbb37d8f6e.zip
pig cdx join improvements
-rw-r--r-- pig/filter-cdx-join-urls.pig 2
1 file changed, 1 insertion, 1 deletion
diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
index b396c82..70858b9 100644
--- a/pig/filter-cdx-join-urls.pig
+++ b/pig/filter-cdx-join-urls.pig
@@ -18,7 +18,7 @@ surts = FOREACH urls GENERATE SURTURL(url) AS url_surt;
surts = ORDER surts by url_surt ASC PARALLEL 10;
surts = DISTINCT surts;
-cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = LOAD '$INPUT_CDX' AS cdxline:chararray;
cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');