diff options
-rw-r--r-- | pig/.gitignore | 2 | ||||
-rw-r--r-- | pig/Pipfile | 2 | ||||
-rw-r--r-- | pig/Pipfile.lock | 12 | ||||
-rw-r--r-- | pig/README.md | 2 | ||||
-rwxr-xr-x | pig/fetch_deps.sh | 19 | ||||
-rw-r--r-- | pig/tests/files/example.cdx | 20 | ||||
-rw-r--r-- | pig/tests/pighelper.py | 69 | ||||
-rw-r--r-- | pig/tests/test_filter_cdx.py | 11 |
8 files changed, 127 insertions, 10 deletions
diff --git a/pig/.gitignore b/pig/.gitignore new file mode 100644 index 0000000..504f811 --- /dev/null +++ b/pig/.gitignore @@ -0,0 +1,2 @@ +deps +*.log diff --git a/pig/Pipfile b/pig/Pipfile index af1a0e0..dbdef21 100644 --- a/pig/Pipfile +++ b/pig/Pipfile @@ -17,4 +17,4 @@ nose = "*" [requires] -python_version = "2.7" +python_version = "3.5" diff --git a/pig/Pipfile.lock b/pig/Pipfile.lock index 2ae4d69..3ac834f 100644 --- a/pig/Pipfile.lock +++ b/pig/Pipfile.lock @@ -1,24 +1,24 @@ { "_meta": { "hash": { - "sha256": "8a7f3e832d1c7a39918cabd60145d566e66b48ca8f7ada59b0128a28a8096398" + "sha256": "6591f4ebe3f7ad124ef9bae389cd9c11b2fe1d6936bf65c4cec64cb016ef0661" }, "host-environment-markers": { "implementation_name": "cpython", - "implementation_version": "0", + "implementation_version": "3.5.3", "os_name": "posix", "platform_machine": "x86_64", "platform_python_implementation": "CPython", "platform_release": "4.9.0-6-amd64", "platform_system": "Linux", "platform_version": "#1 SMP Debian 4.9.82-1+deb9u3 (2018-03-02)", - "python_full_version": "2.7.13", - "python_version": "2.7", - "sys_platform": "linux2" + "python_full_version": "3.5.3", + "python_version": "3.5", + "sys_platform": "linux" }, "pipfile-spec": 6, "requires": { - "python_version": "2.7" + "python_version": "3.5" }, "sources": [ { diff --git a/pig/README.md b/pig/README.md index e47e31d..c518591 100644 --- a/pig/README.md +++ b/pig/README.md @@ -13,7 +13,7 @@ https://hub.docker.com/r/chalimartines/local-pig ln -s pig-0.12.0-cdh5.0.1/pig-0.12.0-cdh5.0.1.jar pig-0.12.0-cdh5.0.1/pig.jar ./pig-*/bin/pig -x local -version - #XXX: + #XXX: don't need Hadoop? #wget https://archive.cloudera.com/cdh5/cdh/5/hadoop-2.3.0-cdh5.0.1.tar.gz #tar xvf hadoop-*.tar.gz #export HADOOP_HOME=hadoop-2.3* diff --git a/pig/fetch_deps.sh b/pig/fetch_deps.sh new file mode 100755 index 0000000..529f8f7 --- /dev/null +++ b/pig/fetch_deps.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# If you change this, also update tests/pighelper.py +PIG_VERSION="0.12.0-cdh5.0.1" + +mkdir -p deps/ +cd deps/ +wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz +tar xvf pig-${PIG_VERSION}.tar.gz +ln -fs pig-${PIG_VERSION} pig +cd pig +ln -fs pig-${PIG_VERSION}.jar pig.jar +cd .. + +JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::") +./pig/bin/pig -x local -version + diff --git a/pig/tests/files/example.cdx b/pig/tests/files/example.cdx new file mode 100644 index 0000000..84e3271 --- /dev/null +++ b/pig/tests/files/example.cdx @@ -0,0 +1,20 @@ +edu,cmu,cs,adm,reports-archive)/anon/usr0/ftp/usr0/anon/2002/cmu-cs-02-119.pdf 20170706005950 http://reports-archive.adm.cs.cmu.edu/anon/usr0/ftp/usr0/anon/2002/CMU-CS-02-119.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 361006 17120058 CITESEERX-CRAWL-2017-06-20-20170706004100259-00924-00932-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170706005946792-00926-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +fi,tkk,lib)/diss/2001/isbn951225459x/isbn951225459x.pdf 20170705074926 http://lib.tkk.fi/Diss/2001/isbn951225459X/isbn951225459X.pdf application/pdf 200 KJBCOT7LGBNIAVGEGPUELK5OK6RTFORR - - 344175 255650124 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +org,oxfordjournals,nar)/cgi/reprint/gkl1060v1.pdf 20170706035441 http://nar.oxfordjournals.org/cgi/reprint/gkl1060v1.pdf text/html 301 OX6MLVDFURLT2KSYCXUYW2PZNOVFSEVF - - 697 49346051 CITESEERX-CRAWL-2017-06-20-20170706034741172-00140-00149-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706035435634-00148-3671~wbgrp-svc285.us.archive.org~8443.warc.gz +org,ifaamas)/proceedings/aamas09/pdf/01_full%20papers/02_08_fp_0272.pdf 20170706081902 http://www.ifaamas.org/Proceedings/aamas09/pdf/01_Full%20Papers/02_08_FP_0272.pdf application/pdf 200 GYHX35QJWRJELWJ5GDQZPTPOUUZOCTKF - - 251180 34635154 CITESEERX-CRAWL-2017-06-20-20170706081825105-00419-00428-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706081838210-00420-3671~wbgrp-svc285.us.archive.org~8443.warc.gz +de,fau,cs)/publications/2014/lukas_14_masterthesis.pdf 20170705101722 http://www4.cs.fau.de/Publications/2014/lukas_14_masterthesis.pdf application/pdf 200 GIUQT7SXZ33TWEFBM2MWURJI2M3QE3IW - - 1290532 71068435 CITESEERX-CRAWL-2017-06-20-20170705101605019-00279-00288-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705101714659-00281-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +de,bund,jki,pub)/index.php/jabfq/article/download/3568/4462 20170706041152 http://pub.jki.bund.de/index.php/JABFQ/article/download/3568/4462/ text/html 301 XZBNO24W2ZPQQMJYE6YUUCSRUF7G3ZBT - - 552 417292708 CITESEERX-CRAWL-2017-06-20-20170706040506112-00160-00169-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706041021844-00165-3671~wbgrp-svc285.us.archive.org~8443.warc.gz +whois://whois.arin.net/z+%2B+132.177.133.114 20170713120653 whois://whois.arin.net/z+%2B+132.177.133.114 text/plain - IDEID4YQ6MVJSOE57NPVDLL53ZB3J4DX - - 876 30983517 CITESEERX-CRAWL-2017-06-20-20170707064626094-01007-01015-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170711214025652-01014-3671~wbgrp-svc285.us.archive.org~8443.warc.gz +za,co,csir,researchspace)/dspace/bitstream/10204/4048/1/smith2_2010.pdf 20170706094159 http://researchspace.csir.co.za/dspace/bitstream/10204/4048/1/Smith2_2010.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 104830407 CITESEERX-CRAWL-2017-06-20-20170706093829986-00509-00518-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706094137978-00512-3671~wbgrp-svc285.us.archive.org~8443.warc.gz +org,annals)/article.aspx?articleid=705034 20170707013120 http://annals.org/article.aspx?articleid=705034 text/html 301 QQYKL57QSERLFM3LXSWMNOFXMOCN7C5G - - 22665 28113974 CITESEERX-CRAWL-2017-06-20-20170707013100780-00967-00976-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170707013100780-00967-3671~wbgrp-svc285.us.archive.org~8443.warc.gz +org,annals)/pdfaccess.ashx?url=/data/journals/aim/20105/0000605-200512200-00013.pdf 20170707045304 http://annals.org/pdfaccess.ashx?url=/data/journals/aim/20105/0000605-200512200-00013.pdf text/html 302 423S7EMGLCVIZ3FLVD7TLAG75HWE4RGI - - 644 222908628 CITESEERX-CRAWL-2017-06-20-20170707042504366-00997-01006-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170707045044604-00999-3671~wbgrp-svc285.us.archive.org~8443.warc.gz +com,sagepub,spi)/content/28/4/501.full.pdf 20170705092027 http://spi.sagepub.com/content/28/4/501.full.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 396 553180242 CITESEERX-CRAWL-2017-06-20-20170705091311851-00219-00228-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705091759818-00223-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +ir,mediaj)/favicon.ico 20170705075240 http://mediaj.ir/favicon.ico text/html 404 E3WSNQ7JAFOW7N3ZJ6GLV27T52T25JDK - - 589 455827180 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705075051100-00135-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +com,sagepub,jpr)/content/8/3-4/239.full.pdf 20170705074931 http://jpr.sagepub.com/content/8/3-4/239.full.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 400 270368088 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +jp,co,nittuden)/business/pdf/transparent_thermoplastic_resin_with_electron_beam_cross-linking.pdf 20170706083459 http://www.nittuden.co.jp/business/pdf/Transparent_Thermoplastic_Resin_with_Electron_Beam_Cross-Linking.pdf application/pdf 200 V32E3CCO7NMI2M4OHLKG73DXD72LR4B2 - - 715081 761088410 CITESEERX-CRAWL-2017-06-20-20170706082646066-00429-00438-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706083257353-00436-3671~wbgrp-svc285.us.archive.org~8443.warc.gz +lt,lms)/robots.txt 20170705122708 http://www.lms.lt/robots.txt text/plain 200 PF3HTQQT2ULYRWFLJGUWZKHTVZUVMZ2F - - 592 668333707 CITESEERX-CRAWL-2017-06-20-20170705121748408-00399-00408-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705122352502-00406-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +hu,bme,phy)/~szalay/pub/multipartcriteriaposter.pdf 20170705124828 http://www.phy.bme.hu/%7Eszalay/pub/multipartcriteriaPoster.pdf application/pdf 200 L3TUEEZLBJTHAVH74B5N426FAIDBCCOE - - 187866 964760782 CITESEERX-CRAWL-2017-06-20-20170705123641979-00419-00428-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705124315591-00426-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +org,adb,openaccess)/bitstream/handle/11540/1260/new-regime-sme-finance-asia.pdf;jsessionid=f966a3bdac9882ec5a7c326b130f6f81?sequence=1 20170705090940 https://openaccess.adb.org/bitstream/handle/11540/1260/new-regime-sme-finance-asia.pdf%3Bjsessionid%3DF966A3BDAC9882EC5A7C326B130F6F81?sequence%3D1 unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 515 634039376 CITESEERX-CRAWL-2017-06-20-20170705090333400-00209-00218-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705090728803-00212-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +org,physiology,ajpregu)/content/272/4/r1084 20170706131006 http://ajpregu.physiology.org/content/272/4/R1084 text/html 200 3FOQSKT4WBYOUA6VKKJCEQCN6QF35ANT - - 27346 336293585 CITESEERX-CRAWL-2017-06-20-20170706130432396-00707-00716-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706130850866-00711-3671~wbgrp-svc285.us.archive.org~8443.warc.gz +de,desy,www-it)/common/documentation/cd-docs/sc2002/paperpdf/pap234.pdf 20170705121813 http://www-it.desy.de/common/documentation/cd-docs/SC2002/paperpdf/pap234.pdf application/pdf 200 BONCZ4NNGRNYR22ASFVU7VYTQ24RRNP4 - - 72421 381715704 CITESEERX-CRAWL-2017-06-20-20170705120827801-00389-00398-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705121708700-00397-31209~wbgrp-svc284.us.archive.org~8443.warc.gz +org,oxfordjournals,bmb)/content/28/3/247.full.pdf 20170706014948 http://bmb.oxfordjournals.org/content/28/3/247.full.pdf text/html 301 EJWYVOPONJRARK7SGG6COFRN7CSTHROY - - 643 119398161 CITESEERX-CRAWL-2017-06-20-20170706014800946-00020-00029-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706014907678-00022-3671~wbgrp-svc285.us.archive.org~8443.warc.gz diff --git a/pig/tests/pighelper.py b/pig/tests/pighelper.py new file mode 100644 index 0000000..cee074d --- /dev/null +++ b/pig/tests/pighelper.py @@ -0,0 +1,69 @@ +""" +A helper class for locally testing Pig scripts. + +author: Bryan Newbold <bnewbold@archive.org> +""" +import os +import tempfile +import unittest +import subprocess +from nose.tools import * + + +class PigTestHelper(unittest.TestCase): + + @classmethod + def setUpClass(cls): + + cls._pigpath= "./deps/pig/bin/pig" + cls._base = [cls._pigpath, + '-x', 'local', + '-log4jconf', 'pig_log4j.properties', + '-stop_on_failure'] + + # Check that pig is functioning + if subprocess.call(cls._base + ['-version']) != 0: + raise unittest.SkipTest("Failed to find and run Pig") + + # Classpath? + # os.path.join("pig-0.12.0-cdh5.0.1", "pig.jar"), + # os.path.join("pig-0.12.0-cdh5.0.1", "lib", "*"), + # "hadoop-2.3.0-cdh5.0.1" + + def setUp(self): + self._tmpdir = tempfile.mkdtemp() + + def tearDown(self): + os.rmdir(self._tmpdir) + + def run_pig_raw(self, params): + """Low-level variant with params appended directly. Returns + CompletedProcess, raises an error if return value isn't succes""" + + retval = subprocess.run(self._base + params, + timeout=20.0, + check=True) + return retval + + def run_pig(self, script_path, in_file, **kwargs): + """Convenience helper around run_pig(). + + INPUT parameter is set to in_file. + OUTPUT parameter is set to a random file. + Any keyword args are passed as parameters. + """ + + pargs = [] + for key, value in kwargs.items(): + pargs.append('-p') + pargs.append('{}={}'.format(key, value)) + + out_file = tempfile.mktemp(dir=self._tmpdir) + params = [ + '-f', script_path, + '-p', 'INPUT={}'.format(in_file), + '-p', 'OUTPUT={}'.format(out_file), + ] + pargs + self.run_pig_raw(params) + return out_file + diff --git a/pig/tests/test_filter_cdx.py b/pig/tests/test_filter_cdx.py index 83f88bb..f46e5e1 100644 --- a/pig/tests/test_filter_cdx.py +++ b/pig/tests/test_filter_cdx.py @@ -10,9 +10,9 @@ Abstract into a base test class/template: import os import unittest from nose.tools import * -from pigpy.hadoop import Hadoop - +from pighelper import PigTestHelper +""" class TestFilterCDX(unittest.TestCase): def setUp(self): @@ -33,3 +33,10 @@ class TestFilterCDX(unittest.TestCase): self.hadoop.run_pig_job("filter-cdx-ps.pig") self.hadoop.copyToLocal("/reports/output.csv", "output.csv") +""" + +class TestFilterCDX(PigTestHelper): + + def test_thing(self): + + self.run_pig("filter-cdx-ps.pig", "tests/files/example.cdx") |