about summary refs log tree commit diff stats
path: root/python/tests
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests')
-rw-r--r-- python/tests/test_savepagenow.py 40
-rw-r--r-- python/tests/test_wayback.py 40
2 files changed, 67 insertions, 13 deletions
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index cbc6aef..8681575 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -3,7 +3,8 @@ import json
import pytest
import responses
-from sandcrawler import SavePageNowClient, SavePageNowError
+from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial
+from test_wayback import *
TARGET = "http://dummy-target.dummy"
@@ -72,6 +73,10 @@ ERROR_BODY = {
"message": "Couldn't resolve host for http://example5123.com.",
"resources": []
}
+CDX_SPN_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"],
+]
@pytest.fixture
def spn_client():
@@ -158,3 +163,36 @@ def test_savepagenow_500(spn_client):
assert len(responses.calls) == 2
+@responses.activate
+def test_crawl_resource(spn_client, wayback_client):
+
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SPN_HIT))
+
+ resp = spn_client.crawl_resource(TARGET, wayback_client)
+
+ assert len(responses.calls) == 4
+
+ assert resp.hit == True
+ assert resp.status == "success"
+ assert resp.body == WARC_BODY
+ assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32
+
+ assert type(resp.cdx) == CdxPartial
+ with pytest.raises(AttributeError):
+ print(resp.cdx.warc_path)
+
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 7e63ec7..eeb4b37 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -35,14 +35,7 @@ CDX_MULTI_HIT = [
def cdx_client():
client = CdxApiClient(
host_url="http://dummy-cdx/cdx",
- )
- return client
-
-@pytest.fixture
-def wayback_client(cdx_client):
- client = WaybackClient(
- cdx_client=cdx_client,
- petabox_webdata_secret="dummy-petabox-secret",
+ cdx_auth_token="dummy-token",
)
return client
@@ -102,9 +95,32 @@ def test_cdx_lookup_best(cdx_client):
assert resp.sha1b32 == CDX_BEST_SHA1B32
assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+WARC_TARGET = "http://fatcat.wiki/"
+WARC_BODY = "<html>some stuff</html>"
+
+@pytest.fixture
+def wayback_client(cdx_client, mocker):
+ client = WaybackClient(
+ cdx_client=cdx_client,
+ petabox_webdata_secret="dummy-petabox-secret",
+ )
+ # mock out the wayback store with mock stuff
+ client.rstore = mocker.Mock()
+ resource = mocker.Mock()
+ client.rstore.load_resource = mocker.MagicMock(return_value=resource)
+ resource.get_status = mocker.MagicMock(return_value=[200])
+ resource.get_location = mocker.MagicMock(return_value=[WARC_TARGET])
+ body = mocker.Mock()
+ resource.open_raw_content = mocker.MagicMock(return_value=body)
+ body.read = mocker.MagicMock(return_value=WARC_BODY)
+
+ return client
+
def test_wayback_fetch(wayback_client, mocker):
- # mock something
- #mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')
- #blah = mocker.Mock()
- return
+ resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
+ assert resp.body == WARC_BODY
+ assert resp.location == WARC_TARGET
+
+ resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
+ assert resp == WARC_BODY