aboutsummaryrefslogtreecommitdiffstats
path: root/pig/tests/test_filter_cdx_paper_pdfs.py
blob: a8ebd9ff5421d8457368411f87401fb62dd46780 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

import os
import unittest
from pighelper import PigTestHelper

def count_lines(s):
    """Return how many non-empty lines the string *s* contains.

    Surrounding whitespace is stripped from the whole string first, then the
    remainder is split on newline characters. Only zero-length pieces are
    skipped, so an interior line holding nothing but spaces still counts.
    """
    pieces = s.strip().split('\n')
    return sum(1 for piece in pieces if piece)

class TestFilterCDXPaperPdfs(PigTestHelper):
    """Check the output line counts of ``filter-cdx-paper-pdfs.pig``.

    Each test runs the script over one CDX fixture through ``self.run_pig``
    and asserts the number of non-empty lines in the returned text.
    """

    def _count_kept_lines(self, cdx_path):
        """Run the filter script on *cdx_path* and count its output lines."""
        # run_pig comes from PigTestHelper; its result is treated as text.
        output = self.run_pig("filter-cdx-paper-pdfs.pig", cdx_path)
        return count_lines(output)

    def test_papers_domain_words(self):
        kept = self._count_kept_lines("tests/files/papers_domain_words.cdx")
        assert kept == 4

    def test_papers_edu_tilde(self):
        kept = self._count_kept_lines("tests/files/papers_edu_tilde.cdx")
        assert kept == 6

    def test_papers_url_doi(self):
        kept = self._count_kept_lines("tests/files/papers_url_doi.cdx")
        assert kept == 2

    def test_papers_url_words(self):
        kept = self._count_kept_lines("tests/files/papers_url_words.cdx")
        assert kept == 12