aboutsummaryrefslogtreecommitdiffstats
path: root/pig
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-03-29 22:04:39 -0700
committerBryan Newbold <bnewbold@archive.org>2018-03-29 22:04:39 -0700
commit2d85e1cce20bea15595a70f2d1fb303e95ca5d0f (patch)
tree7c721fdf07fb5469daed5598426a1ddb724309b1 /pig
parent67e0a765749a4754ed353fe30c8e771d136322a4 (diff)
downloadsandcrawler-2d85e1cce20bea15595a70f2d1fb303e95ca5d0f.tar.gz
sandcrawler-2d85e1cce20bea15595a70f2d1fb303e95ca5d0f.zip
progress on pig tests
Diffstat (limited to 'pig')
-rw-r--r--pig/.gitignore2
-rw-r--r--pig/Pipfile2
-rw-r--r--pig/Pipfile.lock12
-rw-r--r--pig/README.md2
-rwxr-xr-xpig/fetch_deps.sh19
-rw-r--r--pig/tests/files/example.cdx20
-rw-r--r--pig/tests/pighelper.py69
-rw-r--r--pig/tests/test_filter_cdx.py11
8 files changed, 127 insertions, 10 deletions
diff --git a/pig/.gitignore b/pig/.gitignore
new file mode 100644
index 0000000..504f811
--- /dev/null
+++ b/pig/.gitignore
@@ -0,0 +1,2 @@
+deps
+*.log
diff --git a/pig/Pipfile b/pig/Pipfile
index af1a0e0..dbdef21 100644
--- a/pig/Pipfile
+++ b/pig/Pipfile
@@ -17,4 +17,4 @@ nose = "*"
[requires]
-python_version = "2.7"
+python_version = "3.5"
diff --git a/pig/Pipfile.lock b/pig/Pipfile.lock
index 2ae4d69..3ac834f 100644
--- a/pig/Pipfile.lock
+++ b/pig/Pipfile.lock
@@ -1,24 +1,24 @@
{
"_meta": {
"hash": {
- "sha256": "8a7f3e832d1c7a39918cabd60145d566e66b48ca8f7ada59b0128a28a8096398"
+ "sha256": "6591f4ebe3f7ad124ef9bae389cd9c11b2fe1d6936bf65c4cec64cb016ef0661"
},
"host-environment-markers": {
"implementation_name": "cpython",
- "implementation_version": "0",
+ "implementation_version": "3.5.3",
"os_name": "posix",
"platform_machine": "x86_64",
"platform_python_implementation": "CPython",
"platform_release": "4.9.0-6-amd64",
"platform_system": "Linux",
"platform_version": "#1 SMP Debian 4.9.82-1+deb9u3 (2018-03-02)",
- "python_full_version": "2.7.13",
- "python_version": "2.7",
- "sys_platform": "linux2"
+ "python_full_version": "3.5.3",
+ "python_version": "3.5",
+ "sys_platform": "linux"
},
"pipfile-spec": 6,
"requires": {
- "python_version": "2.7"
+ "python_version": "3.5"
},
"sources": [
{
diff --git a/pig/README.md b/pig/README.md
index e47e31d..c518591 100644
--- a/pig/README.md
+++ b/pig/README.md
@@ -13,7 +13,7 @@ https://hub.docker.com/r/chalimartines/local-pig
ln -s pig-0.12.0-cdh5.0.1/pig-0.12.0-cdh5.0.1.jar pig-0.12.0-cdh5.0.1/pig.jar
./pig-*/bin/pig -x local -version
- #XXX:
+ #XXX: don't need Hadoop?
#wget https://archive.cloudera.com/cdh5/cdh/5/hadoop-2.3.0-cdh5.0.1.tar.gz
#tar xvf hadoop-*.tar.gz
#export HADOOP_HOME=hadoop-2.3*
diff --git a/pig/fetch_deps.sh b/pig/fetch_deps.sh
new file mode 100755
index 0000000..529f8f7
--- /dev/null
+++ b/pig/fetch_deps.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# If you change this, also update tests/pighelper.py
+PIG_VERSION="0.12.0-cdh5.0.1"
+
+mkdir -p deps/
+cd deps/
+wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz
+tar xvf pig-${PIG_VERSION}.tar.gz
+ln -fs pig-${PIG_VERSION} pig
+cd pig
+ln -fs pig-${PIG_VERSION}.jar pig.jar
+cd ..
+
+JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")
+./pig/bin/pig -x local -version
+
diff --git a/pig/tests/files/example.cdx b/pig/tests/files/example.cdx
new file mode 100644
index 0000000..84e3271
--- /dev/null
+++ b/pig/tests/files/example.cdx
@@ -0,0 +1,20 @@
+edu,cmu,cs,adm,reports-archive)/anon/usr0/ftp/usr0/anon/2002/cmu-cs-02-119.pdf 20170706005950 http://reports-archive.adm.cs.cmu.edu/anon/usr0/ftp/usr0/anon/2002/CMU-CS-02-119.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 361006 17120058 CITESEERX-CRAWL-2017-06-20-20170706004100259-00924-00932-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170706005946792-00926-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+fi,tkk,lib)/diss/2001/isbn951225459x/isbn951225459x.pdf 20170705074926 http://lib.tkk.fi/Diss/2001/isbn951225459X/isbn951225459X.pdf application/pdf 200 KJBCOT7LGBNIAVGEGPUELK5OK6RTFORR - - 344175 255650124 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,oxfordjournals,nar)/cgi/reprint/gkl1060v1.pdf 20170706035441 http://nar.oxfordjournals.org/cgi/reprint/gkl1060v1.pdf text/html 301 OX6MLVDFURLT2KSYCXUYW2PZNOVFSEVF - - 697 49346051 CITESEERX-CRAWL-2017-06-20-20170706034741172-00140-00149-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706035435634-00148-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+org,ifaamas)/proceedings/aamas09/pdf/01_full%20papers/02_08_fp_0272.pdf 20170706081902 http://www.ifaamas.org/Proceedings/aamas09/pdf/01_Full%20Papers/02_08_FP_0272.pdf application/pdf 200 GYHX35QJWRJELWJ5GDQZPTPOUUZOCTKF - - 251180 34635154 CITESEERX-CRAWL-2017-06-20-20170706081825105-00419-00428-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706081838210-00420-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+de,fau,cs)/publications/2014/lukas_14_masterthesis.pdf 20170705101722 http://www4.cs.fau.de/Publications/2014/lukas_14_masterthesis.pdf application/pdf 200 GIUQT7SXZ33TWEFBM2MWURJI2M3QE3IW - - 1290532 71068435 CITESEERX-CRAWL-2017-06-20-20170705101605019-00279-00288-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705101714659-00281-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+de,bund,jki,pub)/index.php/jabfq/article/download/3568/4462 20170706041152 http://pub.jki.bund.de/index.php/JABFQ/article/download/3568/4462/ text/html 301 XZBNO24W2ZPQQMJYE6YUUCSRUF7G3ZBT - - 552 417292708 CITESEERX-CRAWL-2017-06-20-20170706040506112-00160-00169-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706041021844-00165-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+whois://whois.arin.net/z+%2B+132.177.133.114 20170713120653 whois://whois.arin.net/z+%2B+132.177.133.114 text/plain - IDEID4YQ6MVJSOE57NPVDLL53ZB3J4DX - - 876 30983517 CITESEERX-CRAWL-2017-06-20-20170707064626094-01007-01015-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170711214025652-01014-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+za,co,csir,researchspace)/dspace/bitstream/10204/4048/1/smith2_2010.pdf 20170706094159 http://researchspace.csir.co.za/dspace/bitstream/10204/4048/1/Smith2_2010.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 104830407 CITESEERX-CRAWL-2017-06-20-20170706093829986-00509-00518-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706094137978-00512-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+org,annals)/article.aspx?articleid=705034 20170707013120 http://annals.org/article.aspx?articleid=705034 text/html 301 QQYKL57QSERLFM3LXSWMNOFXMOCN7C5G - - 22665 28113974 CITESEERX-CRAWL-2017-06-20-20170707013100780-00967-00976-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170707013100780-00967-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+org,annals)/pdfaccess.ashx?url=/data/journals/aim/20105/0000605-200512200-00013.pdf 20170707045304 http://annals.org/pdfaccess.ashx?url=/data/journals/aim/20105/0000605-200512200-00013.pdf text/html 302 423S7EMGLCVIZ3FLVD7TLAG75HWE4RGI - - 644 222908628 CITESEERX-CRAWL-2017-06-20-20170707042504366-00997-01006-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170707045044604-00999-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+com,sagepub,spi)/content/28/4/501.full.pdf 20170705092027 http://spi.sagepub.com/content/28/4/501.full.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 396 553180242 CITESEERX-CRAWL-2017-06-20-20170705091311851-00219-00228-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705091759818-00223-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+ir,mediaj)/favicon.ico 20170705075240 http://mediaj.ir/favicon.ico text/html 404 E3WSNQ7JAFOW7N3ZJ6GLV27T52T25JDK - - 589 455827180 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705075051100-00135-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+com,sagepub,jpr)/content/8/3-4/239.full.pdf 20170705074931 http://jpr.sagepub.com/content/8/3-4/239.full.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 400 270368088 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+jp,co,nittuden)/business/pdf/transparent_thermoplastic_resin_with_electron_beam_cross-linking.pdf 20170706083459 http://www.nittuden.co.jp/business/pdf/Transparent_Thermoplastic_Resin_with_Electron_Beam_Cross-Linking.pdf application/pdf 200 V32E3CCO7NMI2M4OHLKG73DXD72LR4B2 - - 715081 761088410 CITESEERX-CRAWL-2017-06-20-20170706082646066-00429-00438-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706083257353-00436-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+lt,lms)/robots.txt 20170705122708 http://www.lms.lt/robots.txt text/plain 200 PF3HTQQT2ULYRWFLJGUWZKHTVZUVMZ2F - - 592 668333707 CITESEERX-CRAWL-2017-06-20-20170705121748408-00399-00408-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705122352502-00406-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+hu,bme,phy)/~szalay/pub/multipartcriteriaposter.pdf 20170705124828 http://www.phy.bme.hu/%7Eszalay/pub/multipartcriteriaPoster.pdf application/pdf 200 L3TUEEZLBJTHAVH74B5N426FAIDBCCOE - - 187866 964760782 CITESEERX-CRAWL-2017-06-20-20170705123641979-00419-00428-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705124315591-00426-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,adb,openaccess)/bitstream/handle/11540/1260/new-regime-sme-finance-asia.pdf;jsessionid=f966a3bdac9882ec5a7c326b130f6f81?sequence=1 20170705090940 https://openaccess.adb.org/bitstream/handle/11540/1260/new-regime-sme-finance-asia.pdf%3Bjsessionid%3DF966A3BDAC9882EC5A7C326B130F6F81?sequence%3D1 unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 515 634039376 CITESEERX-CRAWL-2017-06-20-20170705090333400-00209-00218-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705090728803-00212-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,physiology,ajpregu)/content/272/4/r1084 20170706131006 http://ajpregu.physiology.org/content/272/4/R1084 text/html 200 3FOQSKT4WBYOUA6VKKJCEQCN6QF35ANT - - 27346 336293585 CITESEERX-CRAWL-2017-06-20-20170706130432396-00707-00716-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706130850866-00711-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+de,desy,www-it)/common/documentation/cd-docs/sc2002/paperpdf/pap234.pdf 20170705121813 http://www-it.desy.de/common/documentation/cd-docs/SC2002/paperpdf/pap234.pdf application/pdf 200 BONCZ4NNGRNYR22ASFVU7VYTQ24RRNP4 - - 72421 381715704 CITESEERX-CRAWL-2017-06-20-20170705120827801-00389-00398-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705121708700-00397-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,oxfordjournals,bmb)/content/28/3/247.full.pdf 20170706014948 http://bmb.oxfordjournals.org/content/28/3/247.full.pdf text/html 301 EJWYVOPONJRARK7SGG6COFRN7CSTHROY - - 643 119398161 CITESEERX-CRAWL-2017-06-20-20170706014800946-00020-00029-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706014907678-00022-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
diff --git a/pig/tests/pighelper.py b/pig/tests/pighelper.py
new file mode 100644
index 0000000..cee074d
--- /dev/null
+++ b/pig/tests/pighelper.py
@@ -0,0 +1,69 @@
+"""
+A helper class for locally testing Pig scripts.
+
+author: Bryan Newbold <bnewbold@archive.org>
+"""
+import os
+import tempfile
+import unittest
+import subprocess
+from nose.tools import *
+
+
+class PigTestHelper(unittest.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+
+ cls._pigpath= "./deps/pig/bin/pig"
+ cls._base = [cls._pigpath,
+ '-x', 'local',
+ '-log4jconf', 'pig_log4j.properties',
+ '-stop_on_failure']
+
+ # Check that pig is functioning
+ if subprocess.call(cls._base + ['-version']) != 0:
+ raise unittest.SkipTest("Failed to find and run Pig")
+
+ # Classpath?
+ # os.path.join("pig-0.12.0-cdh5.0.1", "pig.jar"),
+ # os.path.join("pig-0.12.0-cdh5.0.1", "lib", "*"),
+ # "hadoop-2.3.0-cdh5.0.1"
+
+ def setUp(self):
+ self._tmpdir = tempfile.mkdtemp()
+
+ def tearDown(self):
+ os.rmdir(self._tmpdir)
+
+ def run_pig_raw(self, params):
+ """Low-level variant with params appended directly. Returns
+ CompletedProcess, raises an error if return value isn't success"""
+
+ retval = subprocess.run(self._base + params,
+ timeout=20.0,
+ check=True)
+ return retval
+
+ def run_pig(self, script_path, in_file, **kwargs):
+ """Convenience helper around run_pig_raw().
+
+ INPUT parameter is set to in_file.
+ OUTPUT parameter is set to a randomly named path inside the per-test temporary directory.
+ Any keyword args are passed as parameters.
+ """
+
+ pargs = []
+ for key, value in kwargs.items():
+ pargs.append('-p')
+ pargs.append('{}={}'.format(key, value))
+
+ out_file = tempfile.mktemp(dir=self._tmpdir)
+ params = [
+ '-f', script_path,
+ '-p', 'INPUT={}'.format(in_file),
+ '-p', 'OUTPUT={}'.format(out_file),
+ ] + pargs
+ self.run_pig_raw(params)
+ return out_file
+
diff --git a/pig/tests/test_filter_cdx.py b/pig/tests/test_filter_cdx.py
index 83f88bb..f46e5e1 100644
--- a/pig/tests/test_filter_cdx.py
+++ b/pig/tests/test_filter_cdx.py
@@ -10,9 +10,9 @@ Abstract into a base test class/template:
import os
import unittest
from nose.tools import *
-from pigpy.hadoop import Hadoop
-
+from pighelper import PigTestHelper
+"""
class TestFilterCDX(unittest.TestCase):
def setUp(self):
@@ -33,3 +33,10 @@ class TestFilterCDX(unittest.TestCase):
self.hadoop.run_pig_job("filter-cdx-ps.pig")
self.hadoop.copyToLocal("/reports/output.csv", "output.csv")
+"""
+
+class TestFilterCDX(PigTestHelper):
+
+ def test_thing(self):
+
+ self.run_pig("filter-cdx-ps.pig", "tests/files/example.cdx")