From 67e0a765749a4754ed353fe30c8e771d136322a4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 29 Mar 2018 21:50:06 -0700
Subject: import WIP on pig test setup

---
Review notes (commentary only; not part of the commit or of the diff below):

  This patch adds one new file, pig/filter-cdx-ps.pig, an Apache Pig script
  that narrows a set of CDX index lines down to PostScript captures.

  Parameters
    INPUT   path handed to LOAD; declared with %default as an empty string.
    OUTPUT  path handed to STORE; declared with %default as an empty string.
    NOTE(review): both defaults are empty, so callers presumably have to
    supply real paths at invocation time -- confirm how the script is run.

  Pipeline, in statement order
    1. Sets mapreduce.job.queuename to "default".
    2. Loads every input line as a single chararray field named cdxline.
    3. Drops lines that start with 'filedesc' and lines that start with a
       space. NOTE(review): these look like the header / file-description
       records of a CDX file -- confirm against real input.
    4. Splits each line on runs of whitespace and keeps columns 0, 1, 3 and 4
       as url, timestamp, mimetype and httpstatus, carrying the untouched
       cdxline alongside them.
    5. Keeps only rows whose url is not '-', whose httpstatus matches '200'
       and whose mimetype contains "postscript".
    6. Orders by url, then timestamp, using PARALLEL 50.
    7. Projects back to the original cdxline and stores it to OUTPUT with
       PigStorage(' ').

  Open questions for the author
    - NOTE(review): only one field (cdxline) is stored, so the PigStorage
      delimiter should never actually be emitted -- confirm.
    - NOTE(review): the whitespace inside the quoted literals (' ') is
      reproduced as it appears in this listing; verify against the repository
      that it is a single space and not a tab.
    - NOTE(review): the final FOREACH runs after ORDER; confirm the output
      ordering is what downstream consumers rely on.

 pig/filter-cdx-ps.pig | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 pig/filter-cdx-ps.pig

(limited to 'pig/filter-cdx-ps.pig')

diff --git a/pig/filter-cdx-ps.pig b/pig/filter-cdx-ps.pig
new file mode 100644
index 0000000..6e80acc
--- /dev/null
+++ b/pig/filter-cdx-ps.pig
@@ -0,0 +1,18 @@
+%default INPUT ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as url, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FILTER cdx BY not url matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*postscript.*';
+cdx = ORDER cdx by url, timestamp PARALLEL 50;
+cdx = FOREACH cdx GENERATE cdxline;
+STORE cdx INTO '$OUTPUT' USING PigStorage(' ');
+
-- 
cgit v1.2.3