diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 182 | ||||
-rw-r--r-- | python/tests/test_savepagenow.py | 160 |
2 files changed, 314 insertions, 28 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index f4e4aae..886f79e 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -6,6 +6,7 @@ import os, sys, time import requests import datetime +from collections import namedtuple import wayback.exception from http.client import IncompleteRead @@ -125,31 +126,114 @@ class WaybackClient: raise WaybackError("failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) return raw_content - def fetch_url_datetime(self, url, datetime): + def fetch_warc_by_url_dt(self, url, datetime): + """ + Helper wrapper that first hits CDX API to get a full CDX row, then + fetches content from wayback + """ cdx_row = self.cdx_client.lookup(url, datetime) return self.fetch_warc_content( cdx_row['warc_path'], cdx_row['warc_offset'], cdx_row['warc_csize']) + def fetch_resource(self, start_url, mimetype=None): + """ + Looks in wayback for a resource starting at the URL, following any + redirects. Returns a ResourceResult object. + + In a for loop: + + lookup best CDX + redirect? + fetch wayback + continue + good? + fetch wayback + return success + bad? + return failure + + got to end? + return failure; too many redirects + """ + next_url = start_url + urls_seen = [start_url] + for i in range(25): + cdx_row = self.cdx_client.lookup_best(next_url, mimetype=mimetype) + if not cdx_row: + return None + if cdx.status_code == 200: + body = self.fetch_warc_content(cdx.warc_path, cdx.warc_offset, cdx_row.warc_csize) + return ResourceResult( + start_url=start_url, + hit=True, + status="success", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=body, + cdx=cdx_row, + ) + elif cdx_row.status_code >= 300 and cdx_row.status_code < 400: + body = self.fetch_warc_content(cdx_row.warc_path, cdx_row.warc_offset, cdx_row.warc_csize) + next_url = body.get_redirect_url() + if next_url in urls_seen: + return ResourceResult( + start_url=start_url, + hit=False, + status="redirect-loop", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=None, + cdx=cdx_row, + ) + urls_seen.append(next_url) + continue + else: + return ResourceResult( + start_url=start_url, + hit=False, + status="remote-status", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=None, + cdx=cdx_row, + ) + return ResourceResult( + start_url=start_url, + hit=False, + status="redirects-exceeded", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=None, + cdx=cdx_row, + ) + class SavePageNowError(Exception): pass -class SavePageNowRemoteError(Exception): - pass +SavePageNowResult = namedtuple('SavePageNowResult', [ + 'success', + 'status', + 'job_id', + 'request_url', + 'terminal_url', + 'terminal_dt', + 'resources', +]) class SavePageNowClient: - def __init__(self, cdx_client=None, - v1endpoint="https://web.archive.org/save/", - v2endpoint="https://web.archive.org/save"): - if cdx_client: - self.cdx_client = cdx_client - else: - self.cdx_client = CdxApiClient() - self.ia_access_key = os.environ.get('IA_ACCESS_KEY') - self.ia_secret_key = os.environ.get('IA_SECRET_KEY') + def __init__(self, v2endpoint="https://web.archive.org/save", **kwargs): + self.ia_access_key = kwargs.get('ia_access_key', + os.environ.get('IA_ACCESS_KEY')) + self.ia_secret_key = kwargs.get('ia_secret_key', + os.environ.get('IA_SECRET_KEY')) self.v2endpoint = v2endpoint self.v2_session = requests_retry_session(retries=5, backoff_factor=3) self.v2_session.headers.update({ @@ -157,45 +241,87 @@ class SavePageNowClient: 'Accept': 'application/json', 'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key), }) + self.poll_count = 30 + self.poll_seconds = 3.0 - def save_url_now_v2(self, url): + def save_url_now_v2(self, request_url): """ - Returns a list of URLs, or raises an error on non-success. + Returns a "SavePageNowResult" (namedtuple) if SPN request was processed + at all, or raises an exception if there was an error with SPN itself. + + If SPN2 was unable to fetch the remote content, `success` will be + false and status will be indicated. + + SavePageNowResult fields: + - success: boolean if SPN + - status: "success" or an error message/type + - job_id: returned by API + - request_url: url we asked to fetch + - terminal_url: final primary resource (after any redirects) + - terminal_timestamp: wayback timestamp of final capture + - resources: list of all URLs captured + + TODO: parse SPN error codes and handle better. Eg, non-200 remote + statuses, invalid hosts/URLs, timeouts, backoff, etc. """ if not (self.ia_access_key and self.ia_secret_key): - raise Exception("SPNv2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)") + raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)") resp = self.v2_session.post( self.v2endpoint, data={ - 'url': url, + 'url': request_url, 'capture_all': 1, 'if_not_archived_within': '1d', }, ) if resp.status_code != 200: - raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url)) + raise SavePageNowError("SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)) resp_json = resp.json() - assert resp_json + + if not resp_json or 'job_id' not in resp_json: + raise SavePageNowError( + "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)) + + job_id = resp_json['job_id'] # poll until complete final_json = None - for i in range(90): + for i in range(self.poll_count): resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, resp_json['job_id'])) - resp.raise_for_status() + try: + resp.raise_for_status() + except: + raise SavePageNowError(resp.content) status = resp.json()['status'] - if status == 'success': + if status == 'pending': + time.sleep(self.poll_seconds) + elif status in ('success', 'error'): final_json = resp.json() - if final_json.get('message', '').startswith('The same snapshot had been made'): - raise SavePageNowError("SPN2 re-snapshot within short time window") break - elif status == 'pending': - time.sleep(1.0) else: - raise SavePageNowError("SPN2 status:{} url:{}".format(status, url)) + raise SavePageNowError("Unknown SPN2 status:{} url:{}".format(status, request_url)) if not final_json: raise SavePageNowError("SPN2 timed out (polling count exceeded)") - #print(final_json) - return final_json['resources'] + if final_json['status'] == "success": + return SavePageNowResult( + True, + "success", + job_id, + request_url, + final_json['original_url'], + final_json['timestamp'], + final_json['resources'], + ) + else: + return SavePageNowResult( + False, + final_json.get('status_ext') or final_json['status'], + job_id, + request_url, + None, + None, + None, + ) diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py new file mode 100644 index 0000000..cbc6aef --- /dev/null +++ b/python/tests/test_savepagenow.py @@ -0,0 +1,160 @@ + +import json +import pytest +import responses + +from sandcrawler import SavePageNowClient, SavePageNowError + + +TARGET = "http://dummy-target.dummy" +JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8" +PENDING_BODY = { + "status": "pending", + "job_id": JOB_ID, + "resources": [ + "https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js", + "https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js", + "https://cdn.onesignal.com/sdks/OneSignalSDK.js", + ] +} +SUCCESS_BODY = { + "status": "success", + "job_id": JOB_ID, + "original_url": TARGET + "/redirect", + "screenshot": "http://web.archive.org/screenshot/http://brewster.kahle.org/", + "timestamp": "20180326070330", + "duration_sec": 6.203, + "resources": [ + TARGET, + TARGET + "/redirect", + "http://brewster.kahle.org/", + "http://brewster.kahle.org/favicon.ico", + "http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg", + "http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg", + "http://brewster.kahle.org/files/2017/01/computer-1294045_960_720-300x300.png", + "http://brewster.kahle.org/files/2017/11/20thcenturytimemachineimages_0000.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6041-1-300x225.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6061-768x1024.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6103-300x225.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6132-225x300.jpg", + "http://brewster.kahle.org/files/2018/02/IMG_6138-1-300x225.jpg", + "http://brewster.kahle.org/wp-content/themes/twentyten/images/wordpress.png", + "http://brewster.kahle.org/wp-content/themes/twentyten/style.css", + "http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4", + "http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4", + "http://platform.twitter.com/widgets.js", + "https://archive-it.org/piwik.js", + "https://platform.twitter.com/jot.html", + "https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js", + "https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html", + "https://syndication.twitter.com/settings", + "https://www.syndikat.org/en/joint_venture/embed/", + "https://www.syndikat.org/wp-admin/images/w-logo-blue.png", + "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamAdmin.css?ver=1.0", + "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamLoginForm.css?ver=1.0", + "https://www.syndikat.org/wp-content/plugins/user-access-manager/js/functions.js?ver=4.9.4", + "https://www.syndikat.org/wp-content/plugins/wysija-newsletters/css/validationEngine.jquery.css?ver=2.8.1", + "https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png", + "https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1", + "https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4", + "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4" + ], + "outlinks":{ + "https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695", + "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695" + } +} +ERROR_BODY = { + "status": "error", + "exception": "[Errno -2] Name or service not known", + "status_ext": "error:invalid-host-resolution", + "job_id": JOB_ID, + "message": "Couldn't resolve host for http://example5123.com.", + "resources": [] +} + +@pytest.fixture +def spn_client(): + client = SavePageNowClient( + v2endpoint="http://dummy-spnv2/save", + ia_access_key="dummy-access-key", + ia_secret_key="dummy-secret-key", + ) + client.poll_seconds = 0.0 + return client + +@responses.activate +def test_savepagenow_success(spn_client): + + responses.add(responses.POST, + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) + + resp = spn_client.save_url_now_v2(TARGET) + + assert len(responses.calls) == 4 + + assert resp.success == True + assert resp.status == "success" + assert resp.request_url == TARGET + assert resp.terminal_url == TARGET + "/redirect" + assert resp.terminal_dt == SUCCESS_BODY['timestamp'] + assert resp.resources == SUCCESS_BODY['resources'] + +@responses.activate +def test_savepagenow_remote_error(spn_client): + + responses.add(responses.POST, + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(ERROR_BODY)) + + resp = spn_client.save_url_now_v2(TARGET) + + assert len(responses.calls) == 3 + + assert resp.success == False + assert resp.status == ERROR_BODY['status_ext'] + assert resp.request_url == TARGET + assert resp.terminal_url == None + assert resp.terminal_dt == None + assert resp.resources == None + +@responses.activate +def test_savepagenow_500(spn_client): + + responses.add(responses.POST, + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + responses.add(responses.GET, + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=500, + body=json.dumps(ERROR_BODY)) + + with pytest.raises(SavePageNowError): + resp = spn_client.save_url_now_v2(TARGET) + + assert len(responses.calls) == 2 + |