aboutsummaryrefslogtreecommitdiffstats
path: root/pig
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-03-29 21:50:06 -0700
committerBryan Newbold <bnewbold@archive.org>2018-03-29 21:50:06 -0700
commit67e0a765749a4754ed353fe30c8e771d136322a4 (patch)
tree1e8d656ecc6f4830e5a3e787ba099f871a8137fa /pig
parentcb1582c44a000983a2150679c51b1baf22c09778 (diff)
downloadsandcrawler-67e0a765749a4754ed353fe30c8e771d136322a4.tar.gz
sandcrawler-67e0a765749a4754ed353fe30c8e771d136322a4.zip
import WIP on pig test setup
Diffstat (limited to 'pig')
-rw-r--r--pig/Pipfile20
-rw-r--r--pig/Pipfile.lock48
-rw-r--r--pig/README.md28
-rw-r--r--pig/filter-cdx-ps.pig18
-rw-r--r--pig/pig_log4j.properties7
-rw-r--r--pig/tests/test_filter_cdx.py35
6 files changed, 156 insertions, 0 deletions
diff --git a/pig/Pipfile b/pig/Pipfile
new file mode 100644
index 0000000..af1a0e0
--- /dev/null
+++ b/pig/Pipfile
@@ -0,0 +1,20 @@
+[[source]]
+
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+name = "pypi"
+
+
+[dev-packages]
+
+
+
+[packages]
+
+pigpy = "*"
+nose = "*"
+
+
+[requires]
+
+python_version = "2.7"
diff --git a/pig/Pipfile.lock b/pig/Pipfile.lock
new file mode 100644
index 0000000..2ae4d69
--- /dev/null
+++ b/pig/Pipfile.lock
@@ -0,0 +1,48 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "8a7f3e832d1c7a39918cabd60145d566e66b48ca8f7ada59b0128a28a8096398"
+ },
+ "host-environment-markers": {
+ "implementation_name": "cpython",
+ "implementation_version": "0",
+ "os_name": "posix",
+ "platform_machine": "x86_64",
+ "platform_python_implementation": "CPython",
+ "platform_release": "4.9.0-6-amd64",
+ "platform_system": "Linux",
+ "platform_version": "#1 SMP Debian 4.9.82-1+deb9u3 (2018-03-02)",
+ "python_full_version": "2.7.13",
+ "python_version": "2.7",
+ "sys_platform": "linux2"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "2.7"
+ },
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.python.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "nose": {
+ "hashes": [
+ "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a",
+ "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac",
+ "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98"
+ ],
+ "version": "==1.3.7"
+ },
+ "pigpy": {
+ "hashes": [
+ "sha256:89f91f07b95a2f84dda28159f8479209d50498d3aef7ff96f653345cbec09c96"
+ ],
+ "version": "==0.7"
+ }
+ },
+ "develop": {}
+}
diff --git a/pig/README.md b/pig/README.md
new file mode 100644
index 0000000..e47e31d
--- /dev/null
+++ b/pig/README.md
@@ -0,0 +1,28 @@
+
+As of March 2018, the archive runs Pig version 0.12.0, via CDH5.0.1 (Cloudera
+Distribution).
+
+## Development and Testing
+
+To run pig in development on your laptop, you can either use docker
+(https://hub.docker.com/r/chalimartines/local-pig) or download and unpack a
+local copy of the Cloudera release:
+
+ wget https://archive.cloudera.com/cdh5/cdh/5/pig-0.12.0-cdh5.0.1.tar.gz
+ tar xvf pig-*.tar.gz
+ ln -s pig-0.12.0-cdh5.0.1/pig-0.12.0-cdh5.0.1.jar pig-0.12.0-cdh5.0.1/pig.jar
+ ./pig-*/bin/pig -x local -version
+
+ #XXX:
+ #wget https://archive.cloudera.com/cdh5/cdh/5/hadoop-2.3.0-cdh5.0.1.tar.gz
+ #tar xvf hadoop-*.tar.gz
+ #export HADOOP_HOME=hadoop-2.3*
+
+Tests require python3, nosetests3, and pigpy (NOTE: the Pipfile in this
+directory currently pins `python_version = "2.7"`). You can install these with:
+
+ pip install pipenv
+ pipenv install --three
+
+Then:
+
+ pipenv run nosetests3
diff --git a/pig/filter-cdx-ps.pig b/pig/filter-cdx-ps.pig
new file mode 100644
index 0000000..6e80acc
--- /dev/null
+++ b/pig/filter-cdx-ps.pig
@@ -0,0 +1,18 @@
+%default INPUT ''
+%default OUTPUT ''
+-- INPUT and OUTPUT default to empty strings; presumably overridden at invocation (e.g. -param) -- confirm.
+set mapreduce.job.queuename default
+-- Load each CDX record as a single string; drop rows starting with 'filedesc' or with a space.
+cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+-- Split on runs of whitespace; columns 0, 1, 3 and 4 become url, timestamp, mimetype and httpstatus.
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as url, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FILTER cdx BY not url matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*postscript.*';
+cdx = ORDER cdx by url, timestamp PARALLEL 50;
+cdx = FOREACH cdx GENERATE cdxline; -- keep only the original, unmodified line for output
+STORE cdx INTO '$OUTPUT' USING PigStorage(' '); -- only one field (cdxline) is stored per row
+
diff --git a/pig/pig_log4j.properties b/pig/pig_log4j.properties
new file mode 100644
index 0000000..a64a19b
--- /dev/null
+++ b/pig/pig_log4j.properties
@@ -0,0 +1,7 @@
+log4j.rootLogger=WARN, A1
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
+
+log4j.logger.org.apache.pig=WARN, A1
+log4j.logger.org.apache.hadoop = WARN, A1
diff --git a/pig/tests/test_filter_cdx.py b/pig/tests/test_filter_cdx.py
new file mode 100644
index 0000000..83f88bb
--- /dev/null
+++ b/pig/tests/test_filter_cdx.py
@@ -0,0 +1,35 @@
+
+"""
+Abstract into a base test class/template:
+
+1. Needs deps downloaded and installed and env configured (bash? .env? makefile?)
+2. In test, create tempdir for output. Print helpful info on every run
+3. Run pig locally, inspect output files
+"""
+
+import os
+import unittest
+from nose.tools import *
+from pigpy.hadoop import Hadoop
+
+
+class TestFilterCDX(unittest.TestCase):
+ # WIP smoke test: drives filter-cdx-ps.pig through pigpy's Hadoop wrapper.
+ def setUp(self):
+ # Relative paths match the tarball/symlink names in README.md; presumably the test must run from pig/ -- TODO confirm.
+ classpaths = [
+ os.path.join("pig-0.12.0-cdh5.0.1", "pig.jar"),
+ os.path.join("pig-0.12.0-cdh5.0.1", "lib", "*"),
+ ]
+ # Hadoop home directory; the README's hadoop download step is still commented out under XXX.
+ local_home = os.path.join("hadoop-2.3.0-cdh5.0.1")
+ # NOTE(review): presumably the filesystem root that pigpy resolves job paths against -- confirm.
+ name_node = "file:///test/files"
+ # Arguments are passed positionally as (local_home, name_node, classpaths).
+ self.hadoop = Hadoop(local_home, name_node, classpaths)
+
+ def test_thing(self):
+ # NOTE(review): no assertions yet, and no INPUT/OUTPUT parameters are passed to the script.
+ self.hadoop.run_pig_job("filter-cdx-ps.pig")
+ self.hadoop.copyToLocal("/reports/output.csv", "output.csv")
+