refactor CdxApiClient, add tests

- always use auth token and get full CDX rows
- simplify to "fetch" (exact url/dt match) and "lookup_best" methods
- all redirect stuff will be moved to a higher level
author: Bryan Newbold <bnewbold@archive.org> 2020-01-08 13:19:26 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-08 13:19:29 -0800
commit: 2035b62d6e46c1c57243ee3e68d1067a30791f54 (patch)
tree: 1d2632881b0ad4830594490ea8e2943b8e204494 /python
parent: 1ca8b792709dde71f350827fdef6e6596dda55a0 (diff)
download: sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.tar.gz
sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.zip
2 files changed, 240 insertions, 40 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 886f79e..1522708 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -15,69 +15,159 @@ from gwb.loader import CDXLoaderFactory
 
 from .misc import b32_hex, requests_retry_session
 
+
+ResourceResult = namedtuple("ResourceResult", [
+    "start_url",
+    "hit",
+    "status",
+    "terminal_url",
+    "terminal_dt",
+    "terminal_status_code",
+    "body",
+    "cdx",
+])
+
+CdxRow = namedtuple('CdxRow', [
+    'surt',
+    'datetime',
+    'url',
+    'mimetype',
+    'status_code',
+    'sha1b32',
+    'sha1hex',
+    'warc_csize',
+    'warc_offset',
+    'warc_path',
+])
+
+CdxPartial = namedtuple('CdxPartial', [
+    'surt',
+    'datetime',
+    'url',
+    'mimetype',
+    'status_code',
+    'sha1b32',
+    'sha1hex',
+])
+
 class CdxApiError(Exception):
     pass
 
 class CdxApiClient:
 
-    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"):
+    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs):
         self.host_url = host_url
         self.http_session = requests_retry_session(retries=3, backoff_factor=3)
         self.http_session.headers.update({
             'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient',
         })
+        self.cdx_auth_token = kwargs.get('cdx_auth_token',
+            os.environ.get('CDX_AUTH_TOKEN'))  # env var is used only when the kwarg is not passed
+        if self.cdx_auth_token:
+            self.http_session.headers.update({
+                'Cookie': 'cdx_auth_token={}'.format(self.cdx_auth_token),  # must read the attribute; no local of this name exists
+            })
         self.wayback_endpoint = "https://web.archive.org/web/"
 
-    def lookup_latest(self, url, recent_only=True, follow_redirects=False, redirect_depth=0):
+    def _query_api(self, params):
         """
-        Looks up most recent HTTP 200 record for the given URL.
-
-        Returns a CDX dict, or None if not found.
-
-        NOTE: could do authorized lookup using cookie to get all fields?
+        Hits CDX API with a query, parses result into a list of CdxRow
         """
+        resp = self.http_session.get(self.host_url, params=params)
+        if resp.status_code != 200:
+            raise CdxApiError(resp.text)
+        rj = resp.json()
+        if len(rj) <= 1:
+            return None
+        rows = []
+        for raw in rj[1:]:
+            assert len(raw) == 11    # JSON is short
+            row = CdxRow(
+                surt=raw[0],
+                datetime=raw[1],
+                url=raw[2],
+                mimetype=raw[3],
+                status_code=int(raw[4]),
+                sha1b32=raw[5],
+                sha1hex=b32_hex(raw[5]),
+                warc_csize=raw[8],
+                warc_offset=raw[9],
+                warc_path=raw[10],
+            )
+            assert (row.mimetype == "-") or ("-" not in row)
+            rows.append(row)
+        return rows
 
-        if redirect_depth >= 15:
-            raise CdxApiError("redirect loop (by iteration count)")
-
-        since = datetime.date.today() - datetime.timedelta(weeks=4)
+    def fetch(self, url, datetime):
+        """
+        Fetches a single CDX row by url/datetime. Raises a KeyError if not
+        found, because we expect to be looking up a specific full record.
+        """
+        if len(datetime) != 14:
+            raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime))
         params = {
             'url': url,
+            'from': datetime,
+            'to': datetime,
             'matchType': 'exact',
             'limit': -1,
             'output': 'json',
         }
-        if recent_only:
+        resp = self._query_api(params)
+        if not resp:
+            raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
+        row = resp[0]
+        if not (row.url == url and row.datetime == datetime):
+            raise KeyError("CDX url/datetime not found: {} {} (closest: {})".format(url, datetime, row))
+        return row
+
+    def lookup_best(self, url, max_age_days=None, best_mimetype=None):
+        """
+        Fetches multiple CDX rows for the given URL, tries to find the most recent.
+
+        If no matching row is found, return None. Note this is different from fetch.
+        """
+        params = {
+            'url': url,
+            'matchType': 'exact',
+            'limit': -25,
+            'output': 'json',
+            'collapse': 'timestamp:6',
+        }
+        if max_age_days:
+            since = datetime.date.today() - datetime.timedelta(days=max_age_days)
-            params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day),
+            params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day)  # plain YYYYMMDD string; a trailing comma here made it a 1-tuple
-        if not follow_redirects:
-            params['filter'] = 'statuscode:200'
-        resp = self.http_session.get(self.host_url, params=params)
-        if resp.status_code != 200:
-            raise CdxApiError(resp.text)
-        rj = resp.json()
-        if len(rj) <= 1:
+        rows = self._query_api(params)
+        if not rows:
             return None
-        cdx = rj[1]
-        assert len(cdx) == 7    # JSON is short
-        cdx = dict(
-            surt=cdx[0],
-            datetime=cdx[1],
-            url=cdx[2],
-            mimetype=cdx[3],
-            http_status=int(cdx[4]),
-            sha1b32=cdx[5],
-            sha1hex=b32_hex(cdx[5]),
-        )
-        if follow_redirects and cdx['http_status'] in (301, 302):
-            try:
-                resp = requests.get(self.wayback_endpoint + cdx['datetime'] + "id_/" + cdx['url'])
-            except requests.exceptions.TooManyRedirects:
-                raise CdxApiError("redirect loop (wayback fetch)")
-            next_url = '/'.join(resp.url.split('/')[5:])
-            if next_url == url:
-                raise CdxApiError("redirect loop (by url)")
-            return self.lookup_latest(next_url, redirect_depth=redirect_depth+1)
-        return cdx
+
+        def cdx_sort_key(r):
+            """
+            Preference order by status code looks like:
+
+                200
+                    mimetype match
+                        most-recent
+                    no match
+                        most-recent
+                3xx
+                    most-recent
+                4xx
+                    most-recent
+                5xx
+                    most-recent
+
+            This function will create a tuple that can be used to sort in *reverse* order.
+            """
+            return (
+                r.status_code == 200,
+                0 - r.status_code,
+                r.mimetype == best_mimetype,
+                r.datetime,
+            )
+
+        rows = sorted(rows, key=cdx_sort_key)
+        return rows[-1]
 
 
 class WaybackError(Exception):
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
new file mode 100644
index 0000000..7e63ec7
--- /dev/null
+++ b/python/tests/test_wayback.py
@@ -0,0 +1,110 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
+
+
+CDX_TARGET = "http://fatcat.wiki/"
+CDX_DT = "20180812220054"
+# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
+CDX_SINGLE_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+]
+
+CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
+# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
+CDX_MULTI_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # more recent, but not the right mimetype
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # more recent and right mimetype, but wrong status code
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # "best"
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # older
+ ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+]
+
+@pytest.fixture
+def cdx_client():
+    client = CdxApiClient(
+        host_url="http://dummy-cdx/cdx",
+    )
+    return client
+
+@pytest.fixture
+def wayback_client(cdx_client):
+    client = WaybackClient(
+        cdx_client=cdx_client,
+        petabox_webdata_secret="dummy-petabox-secret",
+    )
+    return client
+
+@responses.activate
+def test_cdx_fetch(cdx_client):
+
+    responses.add(responses.GET,
+        'http://dummy-cdx/cdx',
+        status=200,
+        body=json.dumps(CDX_SINGLE_HIT))
+
+    resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
+
+    assert len(responses.calls) == 1
+
+    assert resp.datetime == CDX_DT
+    assert resp.url == CDX_TARGET
+    assert resp.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR"
+    assert resp.warc_csize == "8445"
+    assert resp.warc_offset == "108062304"
+    assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
+@responses.activate
+def test_cdx_fetch_errors(cdx_client):
+
+    with pytest.raises(ValueError):
+        resp = cdx_client.fetch(CDX_TARGET, "2019")
+
+    responses.add(responses.GET,
+        'http://dummy-cdx/cdx',
+        status=200,
+        body=json.dumps(CDX_SINGLE_HIT))
+
+    with pytest.raises(KeyError):
+        resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
+
+    with pytest.raises(KeyError):
+        resp = cdx_client.fetch("http://some-other.com", CDX_DT)
+
+    resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
+    assert len(responses.calls) == 3
+
+@responses.activate
+def test_cdx_lookup_best(cdx_client):
+
+    responses.add(responses.GET,
+        'http://dummy-cdx/cdx',
+        status=200,
+        body=json.dumps(CDX_MULTI_HIT))
+
+    resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
+
+    assert len(responses.calls) == 1
+
+    assert resp.datetime == CDX_DT
+    assert resp.url == CDX_TARGET
+    assert resp.sha1b32 == CDX_BEST_SHA1B32
+    assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+
+def test_wayback_fetch(wayback_client, mocker):
+    # mock something
+    #mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')
+    #blah = mocker.Mock()
+    return
+
author	Bryan Newbold <bnewbold@archive.org>	2020-01-08 13:19:26 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-01-08 13:19:29 -0800
commit	2035b62d6e46c1c57243ee3e68d1067a30791f54 (patch)
tree	1d2632881b0ad4830594490ea8e2943b8e204494 /python
parent	1ca8b792709dde71f350827fdef6e6596dda55a0 (diff)
download	sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.tar.gz sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.zip