From 6454cdc93424b23f484fc56e1f9f986490d05c2b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 8 Jan 2020 19:30:38 -0800 Subject: wrap up basic (locally testable) ingest refactor --- python/tests/test_wayback.py | 52 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) (limited to 'python/tests') diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index 8d15d70..2aafe7c 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -54,8 +54,8 @@ def test_cdx_fetch(cdx_client): assert resp.datetime == CDX_DT assert resp.url == CDX_TARGET assert resp.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR" - assert resp.warc_csize == "8445" - assert resp.warc_offset == "108062304" + assert resp.warc_csize == 8445 + assert resp.warc_offset == 108062304 assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" @responses.activate @@ -96,7 +96,17 @@ def test_cdx_lookup_best(cdx_client): assert resp.warc_path == CDX_SINGLE_HIT[1][-1] WARC_TARGET = "http://fatcat.wiki/" -WARC_BODY = b"some stuff" +WARC_BODY = b""" + + + + + +

my big article here

+ blah + + +""" @pytest.fixture def wayback_client(cdx_client, mocker): @@ -116,7 +126,30 @@ def wayback_client(cdx_client, mocker): return client -def test_wayback_fetch(wayback_client, mocker): +@pytest.fixture +def wayback_client_pdf(cdx_client, mocker): + + with open('tests/files/dummy.pdf', 'rb') as f: + pdf_bytes = f.read() + + client = WaybackClient( + cdx_client=cdx_client, + petabox_webdata_secret="dummy-petabox-secret", + ) + # mock out the wayback store with mock stuff + client.rstore = mocker.Mock() + resource = mocker.Mock() + client.rstore.load_resource = mocker.MagicMock(return_value=resource) + resource.get_status = mocker.MagicMock(return_value=[200]) + resource.get_location = mocker.MagicMock(return_value=[WARC_TARGET]) + body = mocker.Mock() + resource.open_raw_content = mocker.MagicMock(return_value=body) + body.read = mocker.MagicMock(return_value=pdf_bytes) + + return client + +@responses.activate +def test_wayback_fetch(wayback_client): resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz") assert resp.body == WARC_BODY assert resp.location == WARC_TARGET @@ -124,3 +157,14 @@ def test_wayback_fetch(wayback_client, mocker): resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz") assert resp == WARC_BODY +@responses.activate +def test_lookup_resource_success(wayback_client): + + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_MULTI_HIT)) + + resp = wayback_client.lookup_resource(CDX_TARGET) + + assert resp.hit == True -- cgit v1.2.3