diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-17 17:19:34 +0100 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-17 17:19:34 +0100 |
commit | 54dabe601eaa19d0495d9a102b34e9daa056457d (patch) | |
tree | 392e3ba4fa6a6c9d4fdda2de0e7b4656ead18f83 /pig/filter-cdx-pdfs.pig | |
parent | 04e1ae4f903af98ef174be9110aaae5e1ab81360 (diff) | |
download | sandcrawler-54dabe601eaa19d0495d9a102b34e9daa056457d.tar.gz sandcrawler-54dabe601eaa19d0495d9a102b34e9daa056457d.zip |
new/additional GWB CDX filter scripts
Diffstat (limited to 'pig/filter-cdx-pdfs.pig')
-rw-r--r-- | pig/filter-cdx-pdfs.pig | 24 |
1 files changed, 24 insertions, 0 deletions
-- pig/filter-cdx-pdfs.pig
--
-- Tries to filter down a large CDX file (GWB index) to a subset of PDFs, by mimetype.
--
-- Author: Bryan Newbold <bnewbold@archive.org>
-- Date: May 2018
--
-- Parameters:
--   INPUT   path handed to LOAD (defaults to the empty string)
--   OUTPUT  path handed to STORE (defaults to the empty string)

%default INPUT ''
%default OUTPUT ''

set mapreduce.job.queuename default

-- Read the input with a single chararray field named cdxline.
cdx_raw = LOAD '$INPUT' AS cdxline:chararray;

-- Discard lines that begin with 'filedesc' or with a space
-- (presumably archive file-descriptor records and the CDX header line; confirm).
cdx_records = FILTER cdx_raw BY
    (NOT STARTSWITH(cdxline, 'filedesc'))
    AND (NOT STARTSWITH(cdxline, ' '));

-- Split each line on runs of whitespace, keeping the untouched line alongside.
cdx_split = FOREACH cdx_records GENERATE
    STRSPLIT(cdxline, '\\s+') AS cols,
    cdxline;

-- Name the columns the filters and the sort below rely on (0, 1, 3 and 4).
cdx_fields = FOREACH cdx_split GENERATE
    (chararray)cols.$0 AS url,
    (chararray)cols.$1 AS timestamp,
    (chararray)cols.$3 AS mimetype,
    (chararray)cols.$4 AS httpstatus,
    cdxline;

-- Keep rows whose url is not the '-' placeholder, whose status is 200 and
-- whose mimetype contains 'pdf'.
cdx_pdfs = FILTER cdx_fields BY
    (NOT (url MATCHES '-'))
    AND (httpstatus MATCHES '200')
    AND (mimetype MATCHES '.*pdf.*');

-- Sort by url, then timestamp.
cdx_sorted = ORDER cdx_pdfs BY url, timestamp PARALLEL 50;

-- Emit only the original line text.
cdx_out = FOREACH cdx_sorted GENERATE cdxline;
STORE cdx_out INTO '$OUTPUT' USING PigStorage(' ');