aboutsummaryrefslogtreecommitdiffstats
path: root/pig
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-05-07 23:41:10 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-05-07 23:41:10 -0700
commit 2a1c887309305187d785b34a16c1868d26cb3273 (patch)
tree 56b799ad5505245e5f8a4d08a321eece728510ef /pig
parent e566ee1b4e134bfc06284cf77d8d1370df30d53f (diff)
download sandcrawler-2a1c887309305187d785b34a16c1868d26cb3273.tar.gz
sandcrawler-2a1c887309305187d785b34a16c1868d26cb3273.zip
WIP on filter-cdx-join-urls.pig
Diffstat (limited to 'pig')
-rw-r--r-- pig/filter-cdx-join-urls.pig 37
1 files changed, 37 insertions, 0 deletions
diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
new file mode 100644
index 0000000..b396c82
--- /dev/null
+++ b/pig/filter-cdx-join-urls.pig
@@ -0,0 +1,37 @@
+
+-- Keep CDX lines (HTTP status 200, pdf mimetype) whose SURT key matches one of the input URLs.
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
+%default INPUT_CDX ''
+%default INPUT_URLS ''
+%default OUTPUT ''
+-- The three parameters above default to empty strings; pass real paths on the command line with -param.
+REGISTER /home/webcrawl/pig-scripts/jars/ia-web-commons-jar-with-dependencies-CDH3.jar;
+REGISTER /home/webcrawl/pig-scripts/jars/pigtools.jar;
+DEFINE SURTURL pigtools.SurtUrlKey();
+-- SURTURL (defined above) is the short alias for the pigtools.SurtUrlKey UDF; the job runs on the queue named default.
+set mapreduce.job.queuename default
+
+urls = LOAD '$INPUT_URLS' USING PigStorage() AS url:chararray;  -- first tab-delimited field of each line is the URL
+surts = FOREACH urls GENERATE SURTURL(url) AS url_surt;  -- apply the SURTURL UDF to get the join key
+-- No ORDER BY before DISTINCT: DISTINCT does not keep input order and the JOIN below does not need sorted input.
+surts = DISTINCT surts PARALLEL 10;
+
+-- Load raw CDX lines from the INPUT_CDX parameter (the name declared with a default at the top of the script).
+cdx = LOAD '$INPUT_CDX' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');  -- drop lines that start with filedesc
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');  -- drop lines that start with a space
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;  -- split on whitespace runs; keep the raw line too
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FILTER cdx BY not cdx_surt matches '-';  -- drop rows whose first column is a bare dash
+cdx = FILTER cdx BY httpstatus matches '200';  -- keep HTTP 200 rows only
+cdx = FILTER cdx BY mimetype matches '.*pdf.*';  -- keep rows with pdf anywhere in the mimetype
+
+-- Inner join: keep only the CDX rows whose SURT key also appears in the URL key set
+joined = JOIN cdx BY cdx_surt, surts BY url_surt;
+
+-- Emit just the raw CDX line for every match, de-duplicate, and write the result out
+cdx_lines = FOREACH joined GENERATE cdxline;
+cdx_lines = DISTINCT cdx_lines;
+STORE cdx_lines INTO '$OUTPUT' USING PigStorage();