diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-05-07 22:10:51 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-07 22:11:18 -0700 |
commit | d1401444dbfb515e62094f873d520a23ccbc29d9 (patch) | |
tree | 418a21b93261230b006127107b124e5c12236ab7 /pig/tests/test_filter_cdx_paper_pdfs.py | |
parent | 81d2f6290fff487f0f49b109227443c0f8a7aedb (diff) | |
download | sandcrawler-d1401444dbfb515e62094f873d520a23ccbc29d9.tar.gz sandcrawler-d1401444dbfb515e62094f873d520a23ccbc29d9.zip |
pig script to filter GWB CDX by SURT regexes
Diffstat (limited to 'pig/tests/test_filter_cdx_paper_pdfs.py')
-rw-r--r-- | pig/tests/test_filter_cdx_paper_pdfs.py | 26 |
1 files changed, 26 insertions, 0 deletions
diff --git a/pig/tests/test_filter_cdx_paper_pdfs.py b/pig/tests/test_filter_cdx_paper_pdfs.py
new file mode 100644
index 0000000..a8ebd9f
--- /dev/null
+++ b/pig/tests/test_filter_cdx_paper_pdfs.py
@@ -0,0 +1,26 @@
+
+import os
+import unittest
+from pighelper import PigTestHelper
+
+def count_lines(s):
+    return len([l for l in s.strip().split('\n') if len(l) > 0])
+
+class TestFilterCDXPaperPdfs(PigTestHelper):
+
+    def test_papers_domain_words(self):
+        r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_domain_words.cdx")
+        assert count_lines(r) == 4
+
+    def test_papers_edu_tilde(self):
+        r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_edu_tilde.cdx")
+        assert count_lines(r) == 6
+
+    def test_papers_url_doi(self):
+        r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_doi.cdx")
+        assert count_lines(r) == 2
+
+    def test_papers_url_words(self):
+        r = self.run_pig("filter-cdx-paper-pdfs.pig", "tests/files/papers_url_words.cdx")
+        assert count_lines(r) == 12
+