aboutsummaryrefslogtreecommitdiffstats
path: root/pig/filter-cdx-pdfs.pig
blob: a2882acbbe9168e77bdf5124d728936548dd6be3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

-- Tries to filter down a large CDX file (GWB index) to a subset of PDFs, by mimetype.
--
-- Author: Bryan Newbold <bnewbold@archive.org>
-- Date: May 2018

%default INPUT ''
%default OUTPUT ''

set mapreduce.job.queuename default

-- Read every input record as one chararray field named cdxline.
raw_lines = LOAD '$INPUT' AS cdxline:chararray;

-- Discard records that begin with 'filedesc' or with a single space
-- (presumably non-capture/header rows of the index; confirm against input).
data_lines = FILTER raw_lines BY
    (NOT STARTSWITH(cdxline, 'filedesc'))
    AND (NOT STARTSWITH(cdxline, ' '));

-- Tokenize on runs of whitespace, carrying the untouched line alongside.
tokenized = FOREACH data_lines GENERATE STRSPLIT(cdxline, '\\s+') AS cols, cdxline;

-- Name the columns needed for filtering and sorting (positions 0, 1, 3, 4).
fields = FOREACH tokenized GENERATE
    (chararray)cols.$0 AS url,
    (chararray)cols.$1 AS timestamp,
    (chararray)cols.$3 AS mimetype,
    (chararray)cols.$4 AS httpstatus,
    cdxline;

-- Keep rows whose url is not '-', whose status is exactly 200, and whose
-- mimetype contains the substring 'pdf'.
pdf_hits = FILTER fields BY
    (NOT (url matches '-'))
    AND (httpstatus matches '200')
    AND (mimetype matches '.*pdf.*');

-- Sort by url, then timestamp, using 50 reducers.
sorted_hits = ORDER pdf_hits BY url, timestamp PARALLEL 50;

-- Emit only the original, unmodified lines.
output_lines = FOREACH sorted_hits GENERATE cdxline;
STORE output_lines INTO '$OUTPUT' USING PigStorage(' ');