blob: ed9c0bbedf012d728bb66e37bf1005ca8684dc6f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
import pytest
from sandcrawler.workers import CdxLinePusher, BlackholeSink
def test_cdx_line_pusher():
    """Run CdxLinePusher over the example CDX fixture and check its counters.

    Two passes are made over 'tests/files/example.cdx', both pushing into the
    same BlackholeSink instance: one with no explicit filter arguments, and
    one restricted to HTTP status 200 and mimetype application/pdf. Each pass
    asserts on the counts mapping returned by ``run()``.
    """
    blackhole = BlackholeSink()

    # Pass 1: vanilla (only default filters).
    with open('tests/files/example.cdx', 'r') as fixture:
        tally = CdxLinePusher(blackhole, fixture).run()
        for key, expected in (
            ('total', 20),
            ('skip-parse', 1),
            ('pushed', 19),
        ):
            assert tally[key] == expected

    # Pass 2: HTTP 200 and application/pdf only.
    filter_kwargs = {
        'filter_mimetypes': ['application/pdf'],
        'filter_http_statuses': [200],
    }
    with open('tests/files/example.cdx', 'r') as fixture:
        tally = CdxLinePusher(blackhole, fixture, **filter_kwargs).run()
        # The skip and pushed figures below add up to the total (1+10+2+7).
        for key, expected in (
            ('total', 20),
            ('skip-parse', 1),
            ('skip-http_status', 10),
            ('skip-mimetype', 2),
            ('pushed', 7),
        ):
            assert tally[key] == expected
|