diff options
Diffstat (limited to 'python/tests/test_savepagenow.py')
-rw-r--r-- | python/tests/test_savepagenow.py | 331 |
1 files changed, 331 insertions, 0 deletions
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py new file mode 100644 index 0000000..add2c60 --- /dev/null +++ b/python/tests/test_savepagenow.py @@ -0,0 +1,331 @@ +import json + +import pytest +import responses +from test_wayback import * + +from sandcrawler import CdxPartial, SavePageNowBackoffError, SavePageNowClient, SavePageNowError + +TARGET = "http://dummy-target.dummy" +JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8" +PENDING_BODY = { + "status": "pending", + "job_id": JOB_ID, + "resources": [ + "https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js", + "https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js", + "https://cdn.onesignal.com/sdks/OneSignalSDK.js", + ], +} +SUCCESS_BODY = { + "status": "success", + "job_id": JOB_ID, + "original_url": TARGET + "/redirect", + "screenshot": "http://web.archive.org/screenshot/http://brewster.kahle.org/", + "timestamp": "20180326070330", + "duration_sec": 6.203, + "resources": [ + TARGET, + TARGET + "/redirect", + "http://brewster.kahle.org/", + "http://brewster.kahle.org/favicon.ico", + "http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg", + "http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg", + "http://brewster.kahle.org/files/2017/01/computer-1294045_960_720-300x300.png", + "http://brewster.kahle.org/files/2017/11/20thcenturytimemachineimages_0000.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6041-1-300x225.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6061-768x1024.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6103-300x225.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6132-225x300.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6138-1-300x225.jpg", + "http://brewster.kahle.org/wp-content/themes/twentyten/images/wordpress.png", + "http://brewster.kahle.org/wp-content/themes/twentyten/style.css", + "http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4", + "http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4", + "http://platform.twitter.com/widgets.js", + "https://archive-it.org/piwik.js", + "https://platform.twitter.com/jot.html", + "https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js", + "https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html", + "https://syndication.twitter.com/settings", + "https://www.syndikat.org/en/joint_venture/embed/", + "https://www.syndikat.org/wp-admin/images/w-logo-blue.png", + "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamAdmin.css?ver=1.0", + "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamLoginForm.css?ver=1.0", + "https://www.syndikat.org/wp-content/plugins/user-access-manager/js/functions.js?ver=4.9.4", + "https://www.syndikat.org/wp-content/plugins/wysija-newsletters/css/validationEngine.jquery.css?ver=2.8.1", + "https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png", + "https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1", + "https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4", + "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4", + ], + "outlinks": { + "https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695", + "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695", + }, +} +ERROR_BODY = { + "status": "error", + "exception": "[Errno -2] Name or service not known", + "status_ext": "error:invalid-host-resolution", + "job_id": JOB_ID, + "message": "Couldn't resolve host for http://example5123.com.", + "resources": [], +} +CDX_SPN_HIT = [ + [ + "urlkey", + "timestamp", + "original", + "mimetype", + "statuscode", + "digest", + "redirect", + "robotflags", + "length", + "offset", + "filename", + ], + [ + "wiki,fatcat)/", + "20180326070330", + TARGET + "/redirect", + "application/pdf", + "200", + CDX_BEST_SHA1B32, + "-", + "-", + "8445", + "108062304", + "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz", + ], +] + + +@pytest.fixture +def spn_client(): + client = SavePageNowClient( + v2endpoint="http://dummy-spnv2/save", + ia_access_key="dummy-access-key", + ia_secret_key="dummy-secret-key", + ) + client.poll_seconds = 0.0 + return client + + +@responses.activate +def test_savepagenow_success(spn_client): + + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", + status=200, + body=json.dumps( + { + "available": 23, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY), + ) + + resp = spn_client.save_url_now_v2(TARGET) + + assert len(responses.calls) == 5 + + assert resp.success is True + assert resp.status == "success" + assert resp.request_url == TARGET + assert resp.terminal_url == TARGET + "/redirect" + assert resp.terminal_dt == SUCCESS_BODY["timestamp"] + assert resp.resources == SUCCESS_BODY["resources"] + + +@responses.activate +def test_savepagenow_remote_error(spn_client): + + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", + status=200, + body=json.dumps( + { + "available": 23, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(ERROR_BODY), + ) + + resp = spn_client.save_url_now_v2(TARGET) + + assert len(responses.calls) == 4 + + assert resp.success is False + assert resp.status == ERROR_BODY["status_ext"] + assert resp.request_url == TARGET + assert resp.terminal_url is None + assert resp.terminal_dt is None + assert resp.resources is None + + +@responses.activate +def test_savepagenow_500(spn_client): + + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", + status=200, + body=json.dumps( + { + "available": 23, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=500, + body=json.dumps(ERROR_BODY), + ) + + with pytest.raises(SavePageNowError): + spn_client.save_url_now_v2(TARGET) + + assert len(responses.calls) == 3 + + +@responses.activate +def test_savepagenow_no_slots(spn_client): + + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", + status=200, + body=json.dumps( + { + "available": 0, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + + with pytest.raises(SavePageNowBackoffError): + spn_client.save_url_now_v2(TARGET) + + assert len(responses.calls) == 1 + + +@responses.activate +def test_crawl_resource(spn_client, wayback_client): + + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/user", + status=200, + body=json.dumps( + { + "available": 23, + "daily_captures": 60295, + "daily_captures_limit": 300000, + "processing": 1, + } + ), + ) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY), + ) + responses.add( + responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT) + ) + responses.add( + responses.GET, + "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=WARC_BODY, + ) + + print("https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect")) + resp = spn_client.crawl_resource(TARGET, wayback_client) + + assert len(responses.calls) == 6 + + assert resp.hit is True + assert resp.status == "success" + assert resp.body == WARC_BODY + assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32 + + assert type(resp.cdx) == CdxPartial + with pytest.raises(AttributeError): + print(resp.cdx.warc_path) |