diff options
Diffstat (limited to 'python/tests/test_savepagenow.py')
-rw-r--r-- | python/tests/test_savepagenow.py | 265 |
1 files changed, 196 insertions, 69 deletions
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py index 63dd887..add2c60 100644 --- a/python/tests/test_savepagenow.py +++ b/python/tests/test_savepagenow.py @@ -1,11 +1,10 @@ - import json + import pytest import responses - -from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial from test_wayback import * +from sandcrawler import CdxPartial, SavePageNowBackoffError, SavePageNowClient, SavePageNowError TARGET = "http://dummy-target.dummy" JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8" @@ -16,7 +15,7 @@ PENDING_BODY = { "https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js", "https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js", "https://cdn.onesignal.com/sdks/OneSignalSDK.js", - ] + ], } SUCCESS_BODY = { "status": "success", @@ -58,12 +57,12 @@ SUCCESS_BODY = { "https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png", "https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1", "https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4", - "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4" + "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4", ], - "outlinks":{ + "outlinks": { "https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695", - "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695" - } + "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695", + }, } ERROR_BODY = { "status": "error", @@ -71,13 +70,38 @@ ERROR_BODY = { "status_ext": "error:invalid-host-resolution", "job_id": JOB_ID, "message": "Couldn't resolve host for http://example5123.com.", - "resources": [] + "resources": [], } CDX_SPN_HIT = [ - ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], - ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"], + [ + "urlkey", + "timestamp", + "original", + "mimetype", + "statuscode", + "digest", + "redirect", + "robotflags", + "length", + "offset", + "filename", + ], + [ + "wiki,fatcat)/", + "20180326070330", + TARGET + "/redirect", + "application/pdf", + "200", + CDX_BEST_SHA1B32, + "-", + "-", + "8445", + "108062304", + "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz", + ], ] + @pytest.fixture def spn_client(): client = SavePageNowClient( @@ -88,112 +112,216 @@ def spn_client(): client.poll_seconds = 0.0 return client + @responses.activate def test_savepagenow_success(spn_client): - responses.add(responses.POST, - 'http://dummy-spnv2/save', + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, + body=json.dumps( + { + "available": 23, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, status=200, - body=json.dumps(SUCCESS_BODY)) + body=json.dumps(SUCCESS_BODY), + ) resp = spn_client.save_url_now_v2(TARGET) - assert len(responses.calls) == 4 + assert len(responses.calls) == 5 - assert resp.success == True + assert resp.success is True assert resp.status == "success" assert resp.request_url == TARGET assert resp.terminal_url == TARGET + "/redirect" - assert resp.terminal_dt == SUCCESS_BODY['timestamp'] - assert resp.resources == SUCCESS_BODY['resources'] + assert resp.terminal_dt == SUCCESS_BODY["timestamp"] + assert resp.resources == SUCCESS_BODY["resources"] + @responses.activate def test_savepagenow_remote_error(spn_client): - responses.add(responses.POST, - 'http://dummy-spnv2/save', + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, + body=json.dumps( + { + "available": 23, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, status=200, - body=json.dumps(ERROR_BODY)) + body=json.dumps(ERROR_BODY), + ) resp = spn_client.save_url_now_v2(TARGET) - assert len(responses.calls) == 3 + assert len(responses.calls) == 4 - assert resp.success == False - assert resp.status == ERROR_BODY['status_ext'] + assert resp.success is False + assert resp.status == ERROR_BODY["status_ext"] assert resp.request_url == TARGET - assert resp.terminal_url == None - assert resp.terminal_dt == None - assert resp.resources == None + assert resp.terminal_url is None + assert resp.terminal_dt is None + assert resp.resources is None + @responses.activate def test_savepagenow_500(spn_client): - responses.add(responses.POST, - 'http://dummy-spnv2/save', + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", + status=200, + body=json.dumps( + { + "available": 23, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + responses.add( + responses.POST, + "http://dummy-spnv2/save", status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, status=500, - body=json.dumps(ERROR_BODY)) + body=json.dumps(ERROR_BODY), + ) with pytest.raises(SavePageNowError): - resp = spn_client.save_url_now_v2(TARGET) + spn_client.save_url_now_v2(TARGET) + + assert len(responses.calls) == 3 + + +@responses.activate +def test_savepagenow_no_slots(spn_client): + + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", + status=200, + body=json.dumps( + { + "available": 0, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + + with pytest.raises(SavePageNowBackoffError): + spn_client.save_url_now_v2(TARGET) + + assert len(responses.calls) == 1 - assert len(responses.calls) == 2 @responses.activate def test_crawl_resource(spn_client, wayback_client): - responses.add(responses.POST, - 'http://dummy-spnv2/save', + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, + body=json.dumps( + { + "available": 23, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + responses.add( + responses.POST, + "http://dummy-spnv2/save", status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, status=200, - body=json.dumps(SUCCESS_BODY)) - responses.add(responses.GET, - 'http://dummy-cdx/cdx', + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, status=200, - body=json.dumps(CDX_SPN_HIT)) - responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), + body=json.dumps(SUCCESS_BODY), + ) + responses.add( + responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT) + ) + responses.add( + responses.GET, + "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"), status=200, headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=WARC_BODY) + body=WARC_BODY, + ) - print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect")) + print("https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect")) resp = spn_client.crawl_resource(TARGET, wayback_client) - assert len(responses.calls) == 5 + assert len(responses.calls) == 6 - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.body == WARC_BODY assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32 @@ -201,4 +329,3 @@ def test_crawl_resource(spn_client, wayback_client): assert type(resp.cdx) == CdxPartial with pytest.raises(AttributeError): print(resp.cdx.warc_path) - |