diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-03-29 21:50:06 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-03-29 21:50:06 -0700 |
commit | 67e0a765749a4754ed353fe30c8e771d136322a4 (patch) | |
tree | 1e8d656ecc6f4830e5a3e787ba099f871a8137fa | |
parent | cb1582c44a000983a2150679c51b1baf22c09778 (diff) | |
download | sandcrawler-67e0a765749a4754ed353fe30c8e771d136322a4.tar.gz sandcrawler-67e0a765749a4754ed353fe30c8e771d136322a4.zip |
import WIP on pig test setup
-rw-r--r-- | pig/Pipfile | 20 | ||||
-rw-r--r-- | pig/Pipfile.lock | 48 | ||||
-rw-r--r-- | pig/README.md | 28 | ||||
-rw-r--r-- | pig/filter-cdx-ps.pig | 18 | ||||
-rw-r--r-- | pig/pig_log4j.properties | 7 | ||||
-rw-r--r-- | pig/tests/test_filter_cdx.py | 35 |
6 files changed, 156 insertions, 0 deletions
diff --git a/pig/Pipfile b/pig/Pipfile new file mode 100644 index 0000000..af1a0e0 --- /dev/null +++ b/pig/Pipfile @@ -0,0 +1,20 @@ +[[source]] + +url = "https://pypi.python.org/simple" +verify_ssl = true +name = "pypi" + + +[dev-packages] + + + +[packages] + +pigpy = "*" +nose = "*" + + +[requires] + +python_version = "2.7" diff --git a/pig/Pipfile.lock b/pig/Pipfile.lock new file mode 100644 index 0000000..2ae4d69 --- /dev/null +++ b/pig/Pipfile.lock @@ -0,0 +1,48 @@ +{ + "_meta": { + "hash": { + "sha256": "8a7f3e832d1c7a39918cabd60145d566e66b48ca8f7ada59b0128a28a8096398" + }, + "host-environment-markers": { + "implementation_name": "cpython", + "implementation_version": "0", + "os_name": "posix", + "platform_machine": "x86_64", + "platform_python_implementation": "CPython", + "platform_release": "4.9.0-6-amd64", + "platform_system": "Linux", + "platform_version": "#1 SMP Debian 4.9.82-1+deb9u3 (2018-03-02)", + "python_full_version": "2.7.13", + "python_version": "2.7", + "sys_platform": "linux2" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "2.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.python.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "nose": { + "hashes": [ + "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a", + "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", + "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98" + ], + "version": "==1.3.7" + }, + "pigpy": { + "hashes": [ + "sha256:89f91f07b95a2f84dda28159f8479209d50498d3aef7ff96f653345cbec09c96" + ], + "version": "==0.7" + } + }, + "develop": {} +} diff --git a/pig/README.md b/pig/README.md new file mode 100644 index 0000000..e47e31d --- /dev/null +++ b/pig/README.md @@ -0,0 +1,28 @@ + +As of March 2018, the archive runs Pig version 0.12.0, via CDH5.0.1 (Cloudera +Distribution). 
+ +## Development and Testing + +To run pig in development on your laptop, you can either use the docker image linked below, or download and unpack the Cloudera release with the commands that follow: + +https://hub.docker.com/r/chalimartines/local-pig + + wget https://archive.cloudera.com/cdh5/cdh/5/pig-0.12.0-cdh5.0.1.tar.gz + tar xvf pig-*.tar.gz + ln -s pig-0.12.0-cdh5.0.1/pig-0.12.0-cdh5.0.1.jar pig-0.12.0-cdh5.0.1/pig.jar + ./pig-*/bin/pig -x local -version + + #XXX: + #wget https://archive.cloudera.com/cdh5/cdh/5/hadoop-2.3.0-cdh5.0.1.tar.gz + #tar xvf hadoop-*.tar.gz + #export HADOOP_HOME=hadoop-2.3* + +Tests require python3, nosetests3, and pigpy. You can install these with: + + pip install pipenv + pipenv install --three + +Then: + + pipenv run nosetests3 diff --git a/pig/filter-cdx-ps.pig b/pig/filter-cdx-ps.pig new file mode 100644 index 0000000..6e80acc --- /dev/null +++ b/pig/filter-cdx-ps.pig @@ -0,0 +1,18 @@ +%default INPUT '' +%default OUTPUT '' + +set mapreduce.job.queuename default + +cdx = LOAD '$INPUT' AS cdxline:chararray; +cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc'); +cdx = FILTER cdx BY not STARTSWITH (cdxline, ' '); + +cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline; +cdx = FOREACH cdx GENERATE (chararray)cols.$0 as url, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline; +cdx = FILTER cdx BY not url matches '-'; +cdx = FILTER cdx BY httpstatus matches '200'; +cdx = FILTER cdx BY mimetype matches '.*postscript.*'; +cdx = ORDER cdx by url, timestamp PARALLEL 50; +cdx = FOREACH cdx GENERATE cdxline; +STORE cdx INTO '$OUTPUT' USING PigStorage(' '); + diff --git a/pig/pig_log4j.properties b/pig/pig_log4j.properties new file mode 100644 index 0000000..a64a19b --- /dev/null +++ b/pig/pig_log4j.properties @@ -0,0 +1,7 @@ +log4j.rootLogger=WARN, A1 +log4j.appender.A1=org.apache.log4j.ConsoleAppender +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 
+log4j.logger.org.apache.pig=WARN, A1 +log4j.logger.org.apache.hadoop = WARN, A1 diff --git a/pig/tests/test_filter_cdx.py b/pig/tests/test_filter_cdx.py new file mode 100644 index 0000000..83f88bb --- /dev/null +++ b/pig/tests/test_filter_cdx.py @@ -0,0 +1,35 @@ + +""" +Abstract into a base test class/template: + +1. Needs deps downloaded and installed and env configured (bash? .env? makefile?) +2. In test, create tempdir for output. Print helpful info on every run +3. Run pig locally, inspect output files +""" + +import os +import unittest +from nose.tools import * +from pigpy.hadoop import Hadoop + + +class TestFilterCDX(unittest.TestCase): + + def setUp(self): + + classpaths = [ + os.path.join("pig-0.12.0-cdh5.0.1", "pig.jar"), + os.path.join("pig-0.12.0-cdh5.0.1", "lib", "*"), + ] + + local_home = os.path.join("hadoop-2.3.0-cdh5.0.1") + + name_node = "file:///test/files" + + self.hadoop = Hadoop(local_home, name_node, classpaths) + + def test_thing(self): + + self.hadoop.run_pig_job("filter-cdx-ps.pig") + self.hadoop.copyToLocal("/reports/output.csv", "output.csv") + |