From 51e2b302d223dc79c38dc0339e66719fd38f067c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 8 Jan 2020 17:03:59 -0800 Subject: more wayback and SPN tests and fixes --- python/tests/test_savepagenow.py | 40 +++++++++++++++++++++++++++++++++++++++- python/tests/test_wayback.py | 40 ++++++++++++++++++++++++++++------------ 2 files changed, 67 insertions(+), 13 deletions(-) (limited to 'python/tests') diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py index cbc6aef..8681575 100644 --- a/python/tests/test_savepagenow.py +++ b/python/tests/test_savepagenow.py @@ -3,7 +3,8 @@ import json import pytest import responses -from sandcrawler import SavePageNowClient, SavePageNowError +from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial +from test_wayback import * TARGET = "http://dummy-target.dummy" @@ -72,6 +73,10 @@ ERROR_BODY = { "message": "Couldn't resolve host for http://example5123.com.", "resources": [] } +CDX_SPN_HIT = [ + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], + ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"], +] @pytest.fixture def spn_client(): @@ -158,3 +163,36 @@ def test_savepagenow_500(spn_client): assert len(responses.calls) == 2 +@responses.activate +def test_crawl_resource(spn_client, wayback_client): + + responses.add(responses.POST, + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SPN_HIT)) + + resp = spn_client.crawl_resource(TARGET, wayback_client) + + assert len(responses.calls) == 4 + + assert resp.hit == True + assert resp.status == "success" + assert resp.body == WARC_BODY + assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32 + + assert type(resp.cdx) == CdxPartial + with pytest.raises(AttributeError): + print(resp.cdx.warc_path) + diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index 7e63ec7..eeb4b37 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -35,14 +35,7 @@ CDX_MULTI_HIT = [ def cdx_client(): client = CdxApiClient( host_url="http://dummy-cdx/cdx", - ) - return client - -@pytest.fixture -def wayback_client(cdx_client): - client = WaybackClient( - cdx_client=cdx_client, - petabox_webdata_secret="dummy-petabox-secret", + cdx_auth_token="dummy-token", ) return client @@ -102,9 +95,32 @@ def test_cdx_lookup_best(cdx_client): assert resp.sha1b32 == CDX_BEST_SHA1B32 assert resp.warc_path == CDX_SINGLE_HIT[1][-1] +WARC_TARGET = "http://fatcat.wiki/" +WARC_BODY = "some stuff" + +@pytest.fixture +def wayback_client(cdx_client, mocker): + client = WaybackClient( + cdx_client=cdx_client, + petabox_webdata_secret="dummy-petabox-secret", + ) + # mock out the wayback store with mock stuff + client.rstore = mocker.Mock() + resource = mocker.Mock() + client.rstore.load_resource = mocker.MagicMock(return_value=resource) + resource.get_status = mocker.MagicMock(return_value=[200]) + resource.get_location = mocker.MagicMock(return_value=[WARC_TARGET]) + body = mocker.Mock() + resource.open_raw_content = mocker.MagicMock(return_value=body) + body.read = mocker.MagicMock(return_value=WARC_BODY) + + return client + def test_wayback_fetch(wayback_client, mocker): - # mock something - #mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka') - #blah = mocker.Mock() - return + resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz") + assert resp.body == WARC_BODY + assert resp.location == WARC_TARGET + + resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz") + assert resp == WARC_BODY -- cgit v1.2.3