from sandcrawler.workers import BlackholeSink, CdxLinePusher def test_cdx_line_pusher(): sink = BlackholeSink() # vanilla (only default filters) with open("tests/files/example.cdx", "r") as cdx_file: pusher = CdxLinePusher(sink, cdx_file) counts = pusher.run() assert counts["total"] == 20 assert counts["skip-parse"] == 1 assert counts["pushed"] == 19 # HTTP 200 and application/pdf with open("tests/files/example.cdx", "r") as cdx_file: pusher = CdxLinePusher( sink, cdx_file, filter_mimetypes=["application/pdf"], filter_http_statuses=[200, 226], ) counts = pusher.run() assert counts["total"] == 20 assert counts["skip-parse"] == 1 assert counts["skip-http_status"] == 10 assert counts["skip-mimetype"] == 2 assert counts["pushed"] == 7