-- filter-cdx-tarball.pig
--
-- Provenance: pig/filter-cdx-tarball.pig, added in commit
-- 54dabe601eaa19d0495d9a102b34e9daa056457d (Thu, 17 Oct 2019),
-- subject "new/additional GWB CDX filter scripts".
--
-- Reduces a large CDX file (GWB index) to the rows that look like tarballs
-- (.tar.gz). The intention is to find software code that isn't in, eg, git.
--
-- Author: Bryan Newbold
-- Date: May 2018
--
-- Parameters:
--   INPUT   path of the CDX data to read (one CDX line per record)
--   OUTPUT  path the surviving, de-duplicated CDX lines are written to

%default INPUT ''
%default OUTPUT ''

set mapreduce.job.queuename default

-- Read every record as one opaque string; the untouched line is carried
-- through the whole pipeline so it can be emitted verbatim at the end.
raw_lines = LOAD '$INPUT' AS cdxline:chararray;

-- Drop lines beginning with 'filedesc' and lines beginning with a space
-- (presumably CDX metadata/header rows rather than captures -- TODO confirm).
no_filedesc = FILTER raw_lines BY not STARTSWITH (cdxline, 'filedesc');
capture_lines = FILTER no_filedesc BY not STARTSWITH (cdxline, ' ');

-- Split on runs of whitespace and pull out the columns the filters need:
-- $0 surt, $1 timestamp, $3 mimetype, $4 httpstatus, $5 sha1sum.
split_lines = FOREACH capture_lines GENERATE
    STRSPLIT(cdxline,'\\s+') as cols,
    cdxline;
parsed = FOREACH split_lines GENERATE
    (chararray)cols.$0 as surt,
    (chararray)cols.$1 as timestamp,
    (chararray)cols.$3 as mimetype,
    (chararray)cols.$4 as httpstatus,
    (chararray)cols.$5 as sha1sum,
    cdxline;

-- Keep rows with a real SURT key, an HTTP 200 status, and a mimetype that
-- mentions octet, gzip, gtar or tgz.
with_surt = FILTER parsed BY not surt matches '-';
status_ok = FILTER with_surt BY httpstatus matches '200';
archive_mime = FILTER status_ok BY mimetype matches '.*(octet|gzip|gtar|tgz).*';

-- This is the core regex: case-insensitively require ".tar.gz" somewhere
-- after a ")" in the SURT (i.e. .tar.gz in the URL).
tarballs = FILTER archive_mime BY surt matches '(?i).+\\).*\\.tar\\.gz.*';

-- DISTINCT by sha1 column: for each sha1sum keep only the single row that
-- TOP selects when ranking the group's rows on column 0 (surt).
by_sha1 = GROUP tarballs BY sha1sum;
cdx_uniq = FOREACH by_sha1 {
    keep = TOP(1, 0, tarballs);
    GENERATE FLATTEN(keep);
};

-- Sort, then write out only the original CDX lines.
cdx_sorted = ORDER cdx_uniq by surt, timestamp PARALLEL 50;
cdx_out = FOREACH cdx_sorted GENERATE cdxline;
STORE cdx_out INTO '$OUTPUT' USING PigStorage(' ');