aboutsummaryrefslogtreecommitdiffstats
path: root/python/tests/test_pushers.py
blob: 52f26c0df2e0e4b670dba6dd1f9cc23cec40cf11 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

import pytest

from sandcrawler.workers import CdxLinePusher, BlackholeSink


def test_cdx_line_pusher():
    """Run CdxLinePusher over the example CDX fixture and verify its counters.

    Two passes are made over the same fixture file, sharing one
    BlackholeSink: the first with no explicit filters, the second with
    explicit mimetype and HTTP status filters. Each pass checks the counts
    mapping returned by ``run()``.
    """
    fixture_path = 'tests/files/example.cdx'
    blackhole = BlackholeSink()

    # Pass 1: no filter arguments supplied, so only the pusher's defaults apply.
    with open(fixture_path, 'r') as handle:
        default_counts = CdxLinePusher(blackhole, handle).run()
    expected_default = (
        ('total', 20),
        ('skip-parse', 1),
        ('pushed', 19),
    )
    for key, want in expected_default:
        assert default_counts[key] == want

    # Pass 2: filter on mimetype application/pdf and HTTP statuses 200 / 226.
    filter_kwargs = dict(
        filter_mimetypes=['application/pdf'],
        filter_http_statuses=[200, 226],
    )
    with open(fixture_path, 'r') as handle:
        filtered_counts = CdxLinePusher(blackhole, handle, **filter_kwargs).run()
    expected_filtered = (
        ('total', 20),
        ('skip-parse', 1),
        ('skip-http_status', 10),
        ('skip-mimetype', 2),
        ('pushed', 7),
    )
    for key, want in expected_filtered:
        assert filtered_counts[key] == want