author     Bryan Newbold <bnewbold@archive.org>  2020-01-07 21:49:59 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2020-01-07 21:50:02 -0800
commit     1ca8b792709dde71f350827fdef6e6596dda55a0 (patch)
tree       0c14ab56dd6483c28b4f1c4ce025c750b9f5971d /python
parent     6e1b28166db996492736d22cfeba564156ce74fe (diff)
refactor SavePaperNowClient and add test
- response as a namedtuple
- "remote" errors (aka, SPN API was HTTP 200 but returned error) aren't an exception
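
For context, a minimal sketch (not part of this commit) of the calling
convention this refactor establishes: only problems with the SPN2 API itself
raise SavePageNowError, while failed remote captures come back as a normal
result with success=False. The target URL here is hypothetical.

    from sandcrawler import SavePageNowClient, SavePageNowError

    # credentials default to IA_ACCESS_KEY/IA_SECRET_KEY from the environment
    client = SavePageNowClient()
    try:
        result = client.save_url_now_v2("http://example.com/paper.pdf")
    except SavePageNowError as e:
        # infrastructure/API problem: non-200 from SPN2, bad response, timeout
        print("SPN2 API error: {}".format(e))
    else:
        if result.success:
            print("captured {} at {}".format(result.terminal_url, result.terminal_dt))
        else:
            # "remote" error: the API worked, but the capture itself failed
            print("capture failed: {}".format(result.status))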
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/ia.py          | 182
-rw-r--r--  python/tests/test_savepagenow.py  | 160
2 files changed, 314 insertions(+), 28 deletions(-)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index f4e4aae..886f79e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -6,6 +6,7 @@
import os, sys, time
import requests
import datetime
+from collections import namedtuple
import wayback.exception
from http.client import IncompleteRead
@@ -125,31 +126,114 @@ class WaybackClient:
raise WaybackError("failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
return raw_content
- def fetch_url_datetime(self, url, datetime):
+ def fetch_warc_by_url_dt(self, url, datetime):
+ """
+ Helper wrapper that first hits CDX API to get a full CDX row, then
+ fetches content from wayback
+ """
cdx_row = self.cdx_client.lookup(url, datetime)
return self.fetch_warc_content(
cdx_row['warc_path'],
cdx_row['warc_offset'],
cdx_row['warc_csize'])
+ def fetch_resource(self, start_url, mimetype=None):
+ """
+ Looks in wayback for a resource starting at the URL, following any
+ redirects. Returns a ResourceResult object, or None if no capture
+ was found for the URL.
+
+ In a for loop:
+
+ lookup best CDX
+ redirect?
+ fetch wayback
+ continue
+ good?
+ fetch wayback
+ return success
+ bad?
+ return failure
+
+ got to end?
+ return failure; too many redirects
+ """
+ next_url = start_url
+ urls_seen = [start_url]
+ for i in range(25):
+ cdx_row = self.cdx_client.lookup_best(next_url, mimetype=mimetype)
+ if not cdx_row:
+ return None
+ if cdx_row.status_code == 200:
+ body = self.fetch_warc_content(cdx_row.warc_path, cdx_row.warc_offset, cdx_row.warc_csize)
+ return ResourceResult(
+ start_url=start_url,
+ hit=True,
+ status="success",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=cdx_row,
+ )
+ elif cdx_row.status_code >= 300 and cdx_row.status_code < 400:
+ body = self.fetch_warc_content(cdx_row.warc_path, cdx_row.warc_offset, cdx_row.warc_csize)
+ next_url = body.get_redirect_url()
+ if next_url in urls_seen:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="redirect-loop",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=None,
+ cdx=cdx_row,
+ )
+ urls_seen.append(next_url)
+ continue
+ else:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="remote-status",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=None,
+ cdx=cdx_row,
+ )
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="redirects-exceeded",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=None,
+ cdx=cdx_row,
+ )
+
class SavePageNowError(Exception):
pass
-class SavePageNowRemoteError(Exception):
- pass
+SavePageNowResult = namedtuple('SavePageNowResult', [
+ 'success',
+ 'status',
+ 'job_id',
+ 'request_url',
+ 'terminal_url',
+ 'terminal_dt',
+ 'resources',
+])
class SavePageNowClient:
- def __init__(self, cdx_client=None,
- v1endpoint="https://web.archive.org/save/",
- v2endpoint="https://web.archive.org/save"):
- if cdx_client:
- self.cdx_client = cdx_client
- else:
- self.cdx_client = CdxApiClient()
- self.ia_access_key = os.environ.get('IA_ACCESS_KEY')
- self.ia_secret_key = os.environ.get('IA_SECRET_KEY')
+ def __init__(self, v2endpoint="https://web.archive.org/save", **kwargs):
+ self.ia_access_key = kwargs.get('ia_access_key',
+ os.environ.get('IA_ACCESS_KEY'))
+ self.ia_secret_key = kwargs.get('ia_secret_key',
+ os.environ.get('IA_SECRET_KEY'))
self.v2endpoint = v2endpoint
self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
self.v2_session.headers.update({
@@ -157,45 +241,87 @@ class SavePageNowClient:
'Accept': 'application/json',
'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key),
})
+ self.poll_count = 30
+ self.poll_seconds = 3.0
- def save_url_now_v2(self, url):
+ def save_url_now_v2(self, request_url):
"""
- Returns a list of URLs, or raises an error on non-success.
+ Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
+ at all, or raises an exception if there was an error with SPN itself.
+
+ If SPN2 was unable to fetch the remote content, `success` will be
+ false and status will be indicated.
+
+ SavePageNowResult fields:
+ - success: boolean; whether the remote capture succeeded
+ - status: "success" or an error message/type
+ - job_id: returned by API
+ - request_url: url we asked to fetch
+ - terminal_url: final primary resource (after any redirects)
+ - terminal_dt: wayback timestamp (datetime string) of the final capture
+ - resources: list of all URLs captured
+
+ TODO: parse SPN error codes and handle better. Eg, non-200 remote
+ statuses, invalid hosts/URLs, timeouts, backoff, etc.
"""
if not (self.ia_access_key and self.ia_secret_key):
- raise Exception("SPNv2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)")
+ raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)")
resp = self.v2_session.post(
self.v2endpoint,
data={
- 'url': url,
+ 'url': request_url,
'capture_all': 1,
'if_not_archived_within': '1d',
},
)
if resp.status_code != 200:
- raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
+ raise SavePageNowError("SPN2 status_code: {}, url: {}".format(resp.status_code, request_url))
resp_json = resp.json()
- assert resp_json
+
+ if not resp_json or 'job_id' not in resp_json:
+ raise SavePageNowError(
+ "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json))
+
+ job_id = resp_json['job_id']
# poll until complete
final_json = None
- for i in range(90):
+ for i in range(self.poll_count):
resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, resp_json['job_id']))
- resp.raise_for_status()
+ try:
+ resp.raise_for_status()
+ except requests.exceptions.HTTPError:
+ raise SavePageNowError(resp.content)
status = resp.json()['status']
- if status == 'success':
+ if status == 'pending':
+ time.sleep(self.poll_seconds)
+ elif status in ('success', 'error'):
final_json = resp.json()
- if final_json.get('message', '').startswith('The same snapshot had been made'):
- raise SavePageNowError("SPN2 re-snapshot within short time window")
break
- elif status == 'pending':
- time.sleep(1.0)
else:
- raise SavePageNowError("SPN2 status:{} url:{}".format(status, url))
+ raise SavePageNowError("Unknown SPN2 status:{} url:{}".format(status, request_url))
if not final_json:
raise SavePageNowError("SPN2 timed out (polling count exceeded)")
- #print(final_json)
- return final_json['resources']
+ if final_json['status'] == "success":
+ return SavePageNowResult(
+ True,
+ "success",
+ job_id,
+ request_url,
+ final_json['original_url'],
+ final_json['timestamp'],
+ final_json['resources'],
+ )
+ else:
+ return SavePageNowResult(
+ False,
+ final_json.get('status_ext') or final_json['status'],
+ job_id,
+ request_url,
+ None,
+ None,
+ None,
+ )
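
The ResourceResult namedtuple constructed by fetch_resource() above is not
defined in this hunk; its fields can be inferred from the constructor calls
in the diff. A sketch of that shape plus hypothetical caller-side handling
(the URL and the actual definition location are assumptions):

    from collections import namedtuple

    # fields inferred from the ResourceResult(...) calls above; the real
    # definition lives elsewhere in ia.py
    ResourceResult = namedtuple('ResourceResult', [
        'start_url', 'hit', 'status', 'terminal_url', 'terminal_dt',
        'terminal_status_code', 'body', 'cdx',
    ])

    client = WaybackClient()
    resource = client.fetch_resource("http://example.com/paper.pdf",
                                     mimetype="application/pdf")
    if resource is None:
        print("no capture found")
    elif resource.hit:
        print("got {} bytes from {}".format(len(resource.body), resource.terminal_url))
    else:
        # e.g. "redirect-loop", "remote-status", "redirects-exceeded"
        print("lookup failed: {}".format(resource.status))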
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
new file mode 100644
index 0000000..cbc6aef
--- /dev/null
+++ b/python/tests/test_savepagenow.py
@@ -0,0 +1,160 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler import SavePageNowClient, SavePageNowError
+
+
+TARGET = "http://dummy-target.dummy"
+JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8"
+PENDING_BODY = {
+ "status": "pending",
+ "job_id": JOB_ID,
+ "resources": [
+ "https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js",
+ "https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js",
+ "https://cdn.onesignal.com/sdks/OneSignalSDK.js",
+ ]
+}
+SUCCESS_BODY = {
+ "status": "success",
+ "job_id": JOB_ID,
+ "original_url": TARGET + "/redirect",
+ "screenshot": "http://web.archive.org/screenshot/http://brewster.kahle.org/",
+ "timestamp": "20180326070330",
+ "duration_sec": 6.203,
+ "resources": [
+ TARGET,
+ TARGET + "/redirect",
+ "http://brewster.kahle.org/",
+ "http://brewster.kahle.org/favicon.ico",
+ "http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg",
+ "http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg",
+ "http://brewster.kahle.org/files/2017/01/computer-1294045_960_720-300x300.png",
+ "http://brewster.kahle.org/files/2017/11/20thcenturytimemachineimages_0000.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6041-1-300x225.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6061-768x1024.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6103-300x225.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6132-225x300.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6138-1-300x225.jpg",
+ "http://brewster.kahle.org/wp-content/themes/twentyten/images/wordpress.png",
+ "http://brewster.kahle.org/wp-content/themes/twentyten/style.css",
+ "http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4",
+ "http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
+ "http://platform.twitter.com/widgets.js",
+ "https://archive-it.org/piwik.js",
+ "https://platform.twitter.com/jot.html",
+ "https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js",
+ "https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html",
+ "https://syndication.twitter.com/settings",
+ "https://www.syndikat.org/en/joint_venture/embed/",
+ "https://www.syndikat.org/wp-admin/images/w-logo-blue.png",
+ "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamAdmin.css?ver=1.0",
+ "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamLoginForm.css?ver=1.0",
+ "https://www.syndikat.org/wp-content/plugins/user-access-manager/js/functions.js?ver=4.9.4",
+ "https://www.syndikat.org/wp-content/plugins/wysija-newsletters/css/validationEngine.jquery.css?ver=2.8.1",
+ "https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png",
+ "https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1",
+ "https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4",
+ "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4"
+ ],
+ "outlinks":{
+ "https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695",
+ "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695"
+ }
+}
+ERROR_BODY = {
+ "status": "error",
+ "exception": "[Errno -2] Name or service not known",
+ "status_ext": "error:invalid-host-resolution",
+ "job_id": JOB_ID,
+ "message": "Couldn't resolve host for http://example5123.com.",
+ "resources": []
+}
+
+@pytest.fixture
+def spn_client():
+ client = SavePageNowClient(
+ v2endpoint="http://dummy-spnv2/save",
+ ia_access_key="dummy-access-key",
+ ia_secret_key="dummy-secret-key",
+ )
+ client.poll_seconds = 0.0
+ return client
+
+@responses.activate
+def test_savepagenow_success(spn_client):
+
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
+
+ resp = spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 4
+
+ assert resp.success == True
+ assert resp.status == "success"
+ assert resp.request_url == TARGET
+ assert resp.terminal_url == TARGET + "/redirect"
+ assert resp.terminal_dt == SUCCESS_BODY['timestamp']
+ assert resp.resources == SUCCESS_BODY['resources']
+
+@responses.activate
+def test_savepagenow_remote_error(spn_client):
+
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(ERROR_BODY))
+
+ resp = spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 3
+
+ assert resp.success == False
+ assert resp.status == ERROR_BODY['status_ext']
+ assert resp.request_url == TARGET
+ assert resp.terminal_url == None
+ assert resp.terminal_dt == None
+ assert resp.resources == None
+
+@responses.activate
+def test_savepagenow_500(spn_client):
+
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=500,
+ body=json.dumps(ERROR_BODY))
+
+ with pytest.raises(SavePageNowError):
+ resp = spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 2
+
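
A note on the mocking pattern used above: the `responses` library consumes
multiple registrations for the same method and URL in registration order,
which is how the tests simulate a pending -> pending -> success polling
sequence against the status endpoint. A standalone sketch (dummy URL):

    import requests
    import responses

    @responses.activate
    def demo_polling_mock():
        url = "http://dummy-spnv2/save/status/x"  # hypothetical job status URL
        responses.add(responses.GET, url, json={"status": "pending"})
        responses.add(responses.GET, url, json={"status": "success"})
        # registered responses are returned in order, one per request
        assert requests.get(url).json()["status"] == "pending"
        assert requests.get(url).json()["status"] == "success"

    demo_polling_mock()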