diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 16:58:09 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 16:58:09 +0000 |
commit | 64503ce8fb755384623821bfabfa81bbb37d8f6e (patch) | |
tree | dee6da085492795f48f53b5d43eaa162d5c125fd /pig | |
parent | 681b085bc2a090b8db366c54780f1ec81d811403 (diff) | |
download | sandcrawler-64503ce8fb755384623821bfabfa81bbb37d8f6e.tar.gz sandcrawler-64503ce8fb755384623821bfabfa81bbb37d8f6e.zip |
pig cdx join improvements
Diffstat (limited to 'pig')
-rw-r--r-- | pig/filter-cdx-join-urls.pig | 2 |
1 file changed, 1 insertion, 1 deletion
```diff
diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
index b396c82..70858b9 100644
--- a/pig/filter-cdx-join-urls.pig
+++ b/pig/filter-cdx-join-urls.pig
@@ -18,7 +18,7 @@ surts = FOREACH urls GENERATE SURTURL(url) AS url_surt;
 surts = ORDER surts by url_surt ASC PARALLEL 10;
 surts = DISTINCT surts;
-cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = LOAD '$INPUT_CDX' AS cdxline:chararray;
 cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
 cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
```