blob: ed17d24baefe02f9bc3212670bbc33593e7fcd8a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
from sandcrawler.workers import BlackholeSink, CdxLinePusher
def test_cdx_line_pusher():
    """Exercise CdxLinePusher against the example CDX fixture.

    The fixture is pushed twice into a BlackholeSink: first with no
    explicit filter arguments, then with mimetype and HTTP-status filter
    arguments. After each pass the counters returned by ``run()`` are
    compared against the expected values.
    """
    blackhole = BlackholeSink()

    # Pass 1: no filter arguments supplied, so only the pusher's defaults apply.
    with open("tests/files/example.cdx", "r") as fixture:
        tally = CdxLinePusher(blackhole, fixture).run()
    for key, expected in (
        ("total", 20),
        ("skip-parse", 1),
        ("pushed", 19),
    ):
        assert tally[key] == expected

    # Pass 2: filter on application/pdf and HTTP statuses 200 / 226.
    filter_kwargs = {
        "filter_mimetypes": ["application/pdf"],
        "filter_http_statuses": [200, 226],
    }
    with open("tests/files/example.cdx", "r") as fixture:
        tally = CdxLinePusher(blackhole, fixture, **filter_kwargs).run()
    for key, expected in (
        ("total", 20),
        ("skip-parse", 1),
        ("skip-http_status", 10),
        ("skip-mimetype", 2),
        ("pushed", 7),
    ):
        assert tally[key] == expected
|