aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--pig/README.md52
-rw-r--r--pig/data.txt21
-rw-r--r--pig/dump.pig3
-rw-r--r--pig/filter-cdx-join-urls.pig43
4 files changed, 118 insertions, 1 deletions
diff --git a/pig/README.md b/pig/README.md
index 7e59600..048c10e 100644
--- a/pig/README.md
+++ b/pig/README.md
@@ -1 +1,51 @@
-# README
+# Notes
+
+In April 2021, we ran pig 0.12 and hadoop 2.6.0-cdh5.14.4.
+
+Pig has a local mode for testing and debugging, `pig -x local script.pig`; only
+pig needs to be installed and `JAVA_HOME` needs to be set.
+
+Additional jars can be loaded, e.g.
+
+* `/home/webcrawl/pig-scripts/jars/ia-web-commons-jar-with-dependencies-CDH3.jar`
+* `/home/webcrawl/pig-scripts/jars/pigtools.jar`
+
+----
+
+# Previous Notes (BN)
+
+As of March 2018, the archive runs Pig version 0.12.0, via CDH5.0.1 (Cloudera
+Distribution).
+
+"Local mode" unit tests in this folder run with Pig version 0.17.0 (controlled
+by `fetch_deps.sh`) due to [dependency/jar issues][pig-bug] in local mode of
+0.12.0.
+
+[pig-bug]: https://issues.apache.org/jira/browse/PIG-3530
+
+## Development and Testing
+
+To run tests, you need Java installed and `JAVA_HOME` configured.
+
+Fetch dependencies (including pig) from top-level directory:
+
+ ./fetch_hadoop.sh
+
+Write `.pig` scripts in this directory, and add a python wrapper test to
+`./tests/` when done. Test vector files (input/output) can go in
+`./tests/files/`.
+
+Run the tests with:
+
+ pipenv run pytest
+
+Could also, in theory, use a docker image ([local-pig][]), but it's pretty easy
+to just download.
+
+[local-pig]: https://hub.docker.com/r/chalimartines/local-pig
+
+## Run in Production
+
+ pig -param INPUT="/user/bnewbold/pdfs/global-20171227034923" \
+ -param OUTPUT="/user/bnewbold/pdfs/gwb-pdf-20171227034923-surt-filter" \
+ filter-cdx-paper-pdfs.pig
diff --git a/pig/data.txt b/pig/data.txt
new file mode 100644
index 0000000..2bb64d4
--- /dev/null
+++ b/pig/data.txt
@@ -0,0 +1,21 @@
+user2000:iN35WlzPBESum
+user2001:BxqHxwxoROF9a
+user2002:T6jA43R4WcFPd
+user2003:ek5ISqj7lS4Yc
+user2004:xuEZMZfUdRNdO
+user2005:pE2b85XpILCTK
+user2006:hJLSCa2nH54RS
+user2007:4ElDDCxp59nQF
+user2008:SK9HT3wzNyOF4
+user2009:v1cOjw1JLENpn
+user2010:cn4TXlLexG9LB
+user2011:U929yOkbqa8PW
+user2012:Sr9TbbQOo2gwl
+user2013:6IgoJ5dZfHNxQ
+user2014:AEsF0pxKiIx39
+user2015:i4jRiBA0TWCgO
+user2016:EDA1ZeP3wHJ9A
+user2017:nWN9qI5rtPzWA
+user2018:MmEboCRwpGR0e
+user2019:ssWIFKt0MxIGx
+user2020:29iuqg8zJiiEW
diff --git a/pig/dump.pig b/pig/dump.pig
new file mode 100644
index 0000000..0359f1a
--- /dev/null
+++ b/pig/dump.pig
@@ -0,0 +1,3 @@
+A = load 'data.txt' using PigStorage(':'); -- matches pig/data.txt added in this commit ('data.file' did not exist)
+B = foreach A generate $0 as id;           -- keep only the username column
+dump B;
diff --git a/pig/filter-cdx-join-urls.pig b/pig/filter-cdx-join-urls.pig
new file mode 100644
index 0000000..3d06804
--- /dev/null
+++ b/pig/filter-cdx-join-urls.pig
@@ -0,0 +1,43 @@
+-- Join a CDX index against a URL list (via SURT keys), keeping unique 200/PDF captures.
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
+%default INPUT_CDX ''
+%default INPUT_URLS ''
+%default OUTPUT ''
+
+REGISTER /home/webcrawl/pig-scripts/jars/ia-web-commons-jar-with-dependencies-CDH3.jar;
+REGISTER /home/webcrawl/pig-scripts/jars/pigtools.jar;
+DEFINE SURTURL pigtools.SurtUrlKey();
+
+set mapreduce.job.queuename 'default';
+
+urls = LOAD '$INPUT_URLS' USING PigStorage() AS url:chararray;
+surts = FOREACH urls GENERATE SURTURL(url) AS url_surt;
+surts = ORDER surts by url_surt ASC PARALLEL 10;
+surts = DISTINCT surts;
+
+cdx = LOAD '$INPUT_CDX' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
+cdx = FILTER cdx BY not cdx_surt matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*pdf.*';
+
+-- Core JOIN
+full_join = JOIN cdx BY cdx_surt, surts BY url_surt;
+
+-- DISTINCT by sha1 column
+full_uniq = FOREACH (GROUP full_join BY sha1sum) {
+    r = TOP(1, 0, $1);
+    GENERATE FLATTEN(r);
+};
+
+result = FOREACH full_uniq GENERATE cdxline;
+result = DISTINCT result;
+
+STORE result INTO '$OUTPUT' USING PigStorage();