From ef8ccdc83e2817a85befab229e11b5a4e34b302e Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 9 Apr 2021 23:57:57 +0200 Subject: pig stubs --- pig/README.md | 52 +++++++++++++++++++++++++++++++++++++++++++- pig/data.txt | 21 ++++++++++++++++++ pig/dump.pig | 3 +++ pig/filter-cdx-join-urls.pig | 43 ++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 pig/data.txt create mode 100644 pig/dump.pig create mode 100644 pig/filter-cdx-join-urls.pig diff --git a/pig/README.md b/pig/README.md index 7e59600..048c10e 100644 --- a/pig/README.md +++ b/pig/README.md @@ -1 +1,51 @@ -# README +# Notes + +In April 2021, we run pig 0.12 and hadoop 2.6.0-cdh5.14.4. + +Pig has a local mode for testing and debugging, `pig -x local script.pig`; only +pig needs to be installed and `JAVA_HOME` to be set. + +Additional jars can be loaded, e.g. + +* `/home/webcrawl/pig-scripts/jars/ia-web-commons-jar-with-dependencies-CDH3.jar` +* `/home/webcrawl/pig-scripts/jars/pigtools.jar` + +---- + +# Previous Notes (BN) + +As of March 2018, the archive runs Pig version 0.12.0, via CDH5.0.1 (Cloudera +Distribution). + +"Local mode" unit tests in this folder run with Pig version 0.17.0 (controlled +by `fetch_deps.sh`) due to [dependency/jar issues][pig-bug] in local mode of +0.12.0. + +[pig-bug]: https://issues.apache.org/jira/browse/PIG-3530 + +## Development and Testing + +To run tests, you need Java installed and `JAVA_HOME` configured. + +Fetch dependencies (including pig) from top-level directory: + + ./fetch_hadoop.sh + +Write `.pig` scripts in this directory, and add a python wrapper test to +`./tests/` when done. Test vector files (input/output) can go in +`./tests/files/`. + +Run the tests with: + + pipenv run pytest + +Could also, in theory, use a docker image ([local-pig][]), but it's pretty easy +to just download. 
+ +[local-pig]: https://hub.docker.com/r/chalimartines/local-pig + +## Run in Production + + pig -param INPUT="/user/bnewbold/pdfs/global-20171227034923" \ + -param OUTPUT="/user/bnewbold/pdfs/gwb-pdf-20171227034923-surt-filter" \ + filter-cdx-paper-pdfs.pig diff --git a/pig/data.txt b/pig/data.txt new file mode 100644 index 0000000..2bb64d4 --- /dev/null +++ b/pig/data.txt @@ -0,0 +1,21 @@ +user2000:iN35WlzPBESum +user2001:BxqHxwxoROF9a +user2002:T6jA43R4WcFPd +user2003:ek5ISqj7lS4Yc +user2004:xuEZMZfUdRNdO +user2005:pE2b85XpILCTK +user2006:hJLSCa2nH54RS +user2007:4ElDDCxp59nQF +user2008:SK9HT3wzNyOF4 +user2009:v1cOjw1JLENpn +user2010:cn4TXlLexG9LB +user2011:U929yOkbqa8PW +user2012:Sr9TbbQOo2gwl +user2013:6IgoJ5dZfHNxQ +user2014:AEsF0pxKiIx39 +user2015:i4jRiBA0TWCgO +user2016:EDA1ZeP3wHJ9A +user2017:nWN9qI5rtPzWA +user2018:MmEboCRwpGR0e +user2019:ssWIFKt0MxIGx +user2020:29iuqg8zJiiEW diff --git a/pig/dump.pig b/pig/dump.pig new file mode 100644 index 0000000..0359f1a --- /dev/null +++ b/pig/dump.pig @@ -0,0 +1,3 @@ +A = load 'data.txt' using PigStorage(':'); +B = foreach A generate $0 as id; +dump B; diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig new file mode 100644 index 0000000..3d06804 --- /dev/null +++ b/pig/filter-cdx-join-urls.pig @@ -0,0 +1,43 @@ + +-- Emit CDX lines for HTTP 200 PDF captures whose SURT key matches an input URL, keeping one line per sha1sum. +-- Author: Bryan Newbold +-- Date: May 2018 + +%default INPUT_CDX '' +%default INPUT_URLS '' +%default OUTPUT '' + +REGISTER /home/webcrawl/pig-scripts/jars/ia-web-commons-jar-with-dependencies-CDH3.jar; +REGISTER /home/webcrawl/pig-scripts/jars/pigtools.jar; +DEFINE SURTURL pigtools.SurtUrlKey(); + +set mapreduce.job.queuename default + +urls = LOAD '$INPUT_URLS' USING PigStorage() AS url:chararray; +surts = FOREACH urls GENERATE SURTURL(url) AS url_surt; +surts = ORDER surts by url_surt ASC PARALLEL 10; +surts = DISTINCT surts; + +cdx = LOAD '$INPUT_CDX' AS cdxline:chararray; +cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); +cdx = FILTER cdx BY not STARTSWITH (cdxline, 
' '); + +cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; +cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline; +cdx = FILTER cdx BY not cdx_surt matches '-'; +cdx = FILTER cdx BY httpstatus matches '200'; +cdx = FILTER cdx BY mimetype matches '.*pdf.*'; + +-- Core JOIN +full_join = JOIN cdx BY cdx_surt, surts BY url_surt; + +-- DISTINCT by sha1 column +full_uniq = FOREACH (GROUP full_join BY sha1sum) { + r = TOP(1, 0, $1); + GENERATE FLATTEN(r); +}; + +result = FOREACH full_uniq GENERATE cdxline; +result = DISTINCT result; + +STORE result INTO '$OUTPUT' USING PigStorage(); -- cgit v1.2.3