diff options
-rw-r--r-- | pig/filter-cdx-join-urls.pig | 55 |
1 files changed, 55 insertions, 0 deletions
diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
new file mode 100644
index 0000000..b396c82
--- /dev/null
+++ b/pig/filter-cdx-join-urls.pig
@@ -0,0 +1,55 @@
+
+--
+-- Keep the CDX lines for HTTP 200 captures with a PDF mimetype whose SURT key
+-- matches the SURT key of a URL listed in INPUT_URLS.
+--
+-- Parameters:
+--   INPUT_CDX   CDX input to filter
+--   INPUT_URLS  URL list, read with PigStorage (first field of each row is the URL)
+--   OUTPUT      destination for the matching CDX lines
+--   INPUT       alternate name for INPUT_CDX (the name the LOAD statement reads)
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
+%default INPUT_CDX ''
+%default INPUT_URLS ''
+%default OUTPUT ''
+-- The CDX LOAD reads the INPUT parameter. Default it to INPUT_CDX so that the
+-- declared name works. An INPUT value passed on the command line still wins.
+%default INPUT '$INPUT_CDX'
+
+REGISTER /home/webcrawl/pig-scripts/jars/ia-web-commons-jar-with-dependencies-CDH3.jar;
+REGISTER /home/webcrawl/pig-scripts/jars/pigtools.jar;
+DEFINE SURTURL pigtools.SurtUrlKey();
+
+set mapreduce.job.queuename default
+
+-- URL list converted to distinct SURT keys.
+urls = LOAD '$INPUT_URLS' USING PigStorage() AS url:chararray;
+surts = FOREACH urls GENERATE SURTURL(url) AS url_surt;
+-- NOTE(review): the DISTINCT below may not keep this ordering. Confirm whether the ORDER is still needed.
+surts = ORDER surts by url_surt ASC PARALLEL 10;
+surts = DISTINCT surts;
+
+-- CDX lines, dropping filedesc records and lines that begin with a space.
+cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+-- Split each line on whitespace and name the columns used below.
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+-- Drop rows whose SURT is a lone dash, then keep only status 200 rows with pdf in the mimetype.
+cdx = FILTER cdx BY not cdx_surt matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*pdf.*';
+
+-- Core JOIN
+full_join = JOIN cdx BY cdx_surt, surts BY url_surt;
+
+-- Emit each matching original CDX line once.
+result = FOREACH full_join GENERATE cdxline;
+result = DISTINCT result;
+
+STORE result INTO '$OUTPUT' USING PigStorage();