aboutsummaryrefslogtreecommitdiffstats
path: root/pig/tests
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-05-08 06:21:29 +0000
committerBryan Newbold <bnewbold@archive.org>2018-05-08 06:21:29 +0000
commite566ee1b4e134bfc06284cf77d8d1370df30d53f (patch)
treef3969054cc5f93608b5c72d41541ea381ef89a6b /pig/tests
parent0c398392aa298d28694bf5bd37d3e4912de8a2f5 (diff)
parent65b7d45852af3de557eaaf200471ff9b1a211970 (diff)
downloadsandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.tar.gz
sandcrawler-e566ee1b4e134bfc06284cf77d8d1370df30d53f.zip
Merge branch 'master' of git.archive.org:webgroup/sandcrawler
Diffstat (limited to 'pig/tests')
-rw-r--r--pig/tests/files/papers_domain_words.cdx11
-rw-r--r--pig/tests/files/papers_edu_tilde.cdx15
-rw-r--r--pig/tests/files/papers_url_doi.cdx7
-rw-r--r--pig/tests/files/papers_url_words.cdx27
-rw-r--r--pig/tests/pighelper.py14
-rw-r--r--pig/tests/test_filter_cdx_paper_pdfs.py26
6 files changed, 97 insertions, 3 deletions
diff --git a/pig/tests/files/papers_domain_words.cdx b/pig/tests/files/papers_domain_words.cdx
new file mode 100644
index 0000000..48e2313
--- /dev/null
+++ b/pig/tests/files/papers_domain_words.cdx
@@ -0,0 +1,11 @@
+#http://research.fit.edu/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf
+#http://ijs.sgmjournals.org:80/cgi/reprint/54/6/2217.pdf
+#http://eprints.ecs.soton.ac.uk/12020/1/mind-the-semantic-gap.pdf
+#http://eprint.uq.edu.au/archive/00004120/01/R103_Forrester_pp.pdf
+
+# should match 4:
+
+edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+org,sgmjournals,ijs)//cgi/reprint/54/6/2217.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+uk,ac,soton,ecs,eprints)/12020/1/mind-the-semantic-gap.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+au,edu,uq,eprint)/archive/00004120/01/R103_Forrester_pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
diff --git a/pig/tests/files/papers_edu_tilde.cdx b/pig/tests/files/papers_edu_tilde.cdx
new file mode 100644
index 0000000..47ca069
--- /dev/null
+++ b/pig/tests/files/papers_edu_tilde.cdx
@@ -0,0 +1,15 @@
+#http://www.stanford.edu:80/~johntayl/Papers/taylor2.pdf
+#http://met.nps.edu/~mtmontgo/papers/isabel_part2.pdf
+#http://www.pitt.edu:80/~druzdzel/psfiles/ecai06.pdf
+#http://www.comp.hkbu.edu.hk/~ymc/papers/conference/ijcnn03_710.pdf
+
+# should be 6 matches:
+hk,edu,hkbu,comp)/~ymc/papers/conference/ijcnn03_710.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,stanford,www)/~johntayl/Papers/taylor2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,nps,met)/~mtmontgo/papers/isabel_part2.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+jp,ac,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+co,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+
+# NOT:
+com,corp,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
diff --git a/pig/tests/files/papers_url_doi.cdx b/pig/tests/files/papers_url_doi.cdx
new file mode 100644
index 0000000..1ad5792
--- /dev/null
+++ b/pig/tests/files/papers_url_doi.cdx
@@ -0,0 +1,7 @@
+#http://journals.ametsoc.org/doi/pdf/10.1175/2008BAMS2370.1
+#http://www.nejm.org:80/doi/pdf/10.1056/NEJMoa1013607
+
+# should match 2:
+
+org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
diff --git a/pig/tests/files/papers_url_words.cdx b/pig/tests/files/papers_url_words.cdx
new file mode 100644
index 0000000..e9bf661
--- /dev/null
+++ b/pig/tests/files/papers_url_words.cdx
@@ -0,0 +1,27 @@
+#http://personal.ee.surrey.ac.uk/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf
+#http://files.eric.ed.gov/fulltext/EJ798626.pdf
+#http://www.hbs.edu/research/pdf/10-108.pdf
+#http://www.unifr.ch/biochem/assets/files/albrecht/publications/Abraham06.pdf
+#http://www.cnbc.cmu.edu/cns/papers/Kassetal2005.pdf
+#http://www.macrothink.org/journal/index.php/ijhrs/article/download/5765/4663
+#http://www.pims.math.ca:80/science/2004/fpsac/Papers/Liskovets.pdf
+#http://www.risc.uni-linz.ac.at/publications/download/risc_3287/synasc_revised.pdf
+#http://softsys.cs.uoi.gr/dbglobe/publications/wi04.pdf
+#http://lexikos.journals.ac.za/pub/article/download/1048/564
+#http://www.siam.org/proceedings/analco/2007/anl07_029ecesaratto.pdf
+#http://www.cs.bris.ac.uk/Publications/Papers/2000249.pdf
+
+# 12 matches:
+
+uk,ac,surrey,ee,personal)/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+gov,ed,eric,files)/fulltext/EJ798626.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,hbs,www)/research/pdf/10-108.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+ch,unifr,www)/biochem/assets/files/albrecht/publications/Abraham06.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+edu,cmu,cnbc,www)/cns/papers/Kassetal2005.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+org,macrothink,www)/journal/index.php/ijhrs/article/download/5765/4663 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+ca,math,pims,www)/science/2004/fpsac/Papers/Liskovets.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+at,ac,uni-linz,risc,www)/publications/download/risc_3287/synasc_revised.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+gr,uoi,cs,softsys)/dbglobe/publications/wi04.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+za,ac,journals,lexikos)/pub/article/download/1048/564 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+org,siam,www)/proceedings/analco/2007/anl07_029ecesaratto.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
+uk,ac,bris,cs,www)/Publications/Papers/2000249.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
diff --git a/pig/tests/pighelper.py b/pig/tests/pighelper.py
index d0d89d2..4aa4259 100644
--- a/pig/tests/pighelper.py
+++ b/pig/tests/pighelper.py
@@ -36,7 +36,8 @@ class PigTestHelper(unittest.TestCase):
self._tmpdir = tempfile.mkdtemp()
def tearDown(self):
- shutil.rmtree(self._tmpdir)
+ pass
+ # XXX: shutil.rmtree(self._tmpdir)
def run_pig_raw(self, params):
"""Low-level variant with params appended directly. Returns
@@ -67,7 +68,14 @@ class PigTestHelper(unittest.TestCase):
'-p', 'INPUT={}'.format(in_file),
'-p', 'OUTPUT={}'.format(out_file),
] + pargs
- self.run_pig_raw(params)
- return out_file
+ status = self.run_pig_raw(params)
+ assert status.returncode == 0
+ # Capture all the part-r-* files
+ print("out_file: {}".format(out_file))
+ subprocess.run("/bin/ls -la {}/part-*".format(out_file), shell=True)
+ sub = subprocess.run("/bin/cat {}/part-*".format(out_file), stdout=subprocess.PIPE, shell=True)
+ out = sub.stdout.decode('utf-8')
+ print(out)
+ return out
# TODO: helper to verify that output matches an expected file
diff --git a/pig/tests/test_filter_cdx_paper_pdfs.py b/pig/tests/test_filter_cdx_paper_pdfs.py
new file mode 100644
index 0000000..a8ebd9f
--- /dev/null
+++ b/pig/tests/test_filter_cdx_paper_pdfs.py
@@ -0,0 +1,26 @@
+
+import os
+import unittest
+from pighelper import PigTestHelper
+
+def count_lines(s):
+ return len([l for l in s.strip().split('\n') if len(l) > 0])
+
+class TestFilterCDXPaperPdfs(PigTestHelper):
+
+ def test_papers_domain_words(self):
+ r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_domain_words.cdx")
+ assert count_lines(r) == 4
+
+ def test_papers_edu_tilde(self):
+ r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_edu_tilde.cdx")
+ assert count_lines(r) == 6
+
+ def test_papers_url_doi(self):
+ r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_doi.cdx")
+ assert count_lines(r) == 2
+
+ def test_papers_url_words(self):
+ r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_words.cdx")
+ assert count_lines(r) == 12
+