about summary refs log tree commit diff stats
path: root/pig/filter-cdx-ps.pig
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-03-29 21:50:06 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-03-29 21:50:06 -0700
commit 67e0a765749a4754ed353fe30c8e771d136322a4 (patch)
tree 1e8d656ecc6f4830e5a3e787ba099f871a8137fa /pig/filter-cdx-ps.pig
parent cb1582c44a000983a2150679c51b1baf22c09778 (diff)
download sandcrawler-67e0a765749a4754ed353fe30c8e771d136322a4.tar.gz
sandcrawler-67e0a765749a4754ed353fe30c8e771d136322a4.zip
import WIP on pig test setup
Diffstat (limited to 'pig/filter-cdx-ps.pig')
-rw-r--r-- pig/filter-cdx-ps.pig 18
1 files changed, 18 insertions, 0 deletions
diff --git a/pig/filter-cdx-ps.pig b/pig/filter-cdx-ps.pig
new file mode 100644
index 0000000..6e80acc
--- /dev/null
+++ b/pig/filter-cdx-ps.pig
@@ -0,0 +1,18 @@
+-- Keep input lines whose httpstatus matches '200' and whose mimetype contains 'postscript'; write them sorted by url, timestamp.
+%default INPUT ''
+%default OUTPUT ''
+set mapreduce.job.queuename default
+
+-- Read each input line whole, then drop lines that begin with 'filedesc' or with a space.
+cdx_raw = LOAD '$INPUT' AS cdxline:chararray;
+cdx_rows = FILTER cdx_raw BY (not STARTSWITH(cdxline, 'filedesc')) AND (not STARTSWITH(cdxline, ' '));
+
+-- Split on runs of whitespace; columns 0, 1, 3 and 4 are read as url, timestamp, mimetype and httpstatus.
+cdx_split = FOREACH cdx_rows GENERATE STRSPLIT(cdxline,'\\s+') AS cols, cdxline;
+cdx_cols = FOREACH cdx_split GENERATE (chararray)cols.$0 AS url, (chararray)cols.$1 AS timestamp, (chararray)cols.$3 AS mimetype, (chararray)cols.$4 AS httpstatus, cdxline;
+cdx_ps = FILTER cdx_cols BY (not url matches '-') AND (httpstatus matches '200') AND (mimetype matches '.*postscript.*');
+-- Sort on the parsed columns, then emit only the original line text.
+cdx_sorted = ORDER cdx_ps BY url, timestamp PARALLEL 50;
+cdx_out = FOREACH cdx_sorted GENERATE cdxline;
+STORE cdx_out INTO '$OUTPUT' USING PigStorage(' ');
+