from sandcrawler.workers import BlackholeSink, CdxLinePusher


def test_cdx_line_pusher():
    """Run CdxLinePusher over the example CDX fixture and check its counters.

    The fixture file is pushed twice into the same BlackholeSink: once with
    no filter arguments, and once restricted by mimetype and HTTP status.
    The counts returned by each ``run()`` are compared against the values
    expected for ``tests/files/example.cdx``.
    """
    fixture_path = "tests/files/example.cdx"
    blackhole = BlackholeSink()

    # Pass 1: no filter keyword arguments given, so only the pusher's
    # default filters are in effect.
    with open(fixture_path, "r") as handle:
        default_counts = CdxLinePusher(blackhole, handle).run()
    expected_default = {
        "total": 20,
        "skip-parse": 1,
        "pushed": 19,
    }
    for key, want in expected_default.items():
        assert default_counts[key] == want

    # Pass 2: restrict to application/pdf records with HTTP status 200 or 226.
    filter_kwargs = {
        "filter_mimetypes": ["application/pdf"],
        "filter_http_statuses": [200, 226],
    }
    with open(fixture_path, "r") as handle:
        filtered_counts = CdxLinePusher(blackhole, handle, **filter_kwargs).run()
    expected_filtered = {
        "total": 20,
        "skip-parse": 1,
        "skip-http_status": 10,
        "skip-mimetype": 2,
        "pushed": 7,
    }
    for key, want in expected_filtered.items():
        assert filtered_counts[key] == want