diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-08 13:19:26 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-08 13:19:29 -0800 |
commit | 2035b62d6e46c1c57243ee3e68d1067a30791f54 (patch) | |
tree | 1d2632881b0ad4830594490ea8e2943b8e204494 /python/sandcrawler | |
parent | 1ca8b792709dde71f350827fdef6e6596dda55a0 (diff) | |
download | sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.tar.gz sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.zip |
refactor CdxApiClient, add tests
- always use auth token and get full CDX rows
- simplify to "fetch" (exact url/dt match) and "lookup_best" methods
- all redirect stuff will be moved to a higher level
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- | python/sandcrawler/ia.py | 170 |
1 file changed, 130 insertions, 40 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 886f79e..1522708 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -15,69 +15,159 @@ from gwb.loader import CDXLoaderFactory from .misc import b32_hex, requests_retry_session + +ResourceResult = namedtuple("ResourceResult", [ + "start_url", + "hit", + "status", + "terminal_url", + "terminal_dt", + "terminal_status_code", + "body", + "cdx", +]) + +CdxRow = namedtuple('CdxRow', [ + 'surt', + 'datetime', + 'url', + 'mimetype', + 'status_code', + 'sha1b32', + 'sha1hex', + 'warc_csize', + 'warc_offset', + 'warc_path', +]) + +CdxPartial = namedtuple('CdxPartial', [ + 'surt', + 'datetime', + 'url', + 'mimetype', + 'status_code', + 'sha1b32', + 'sha1hex', +]) + class CdxApiError(Exception): pass class CdxApiClient: - def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"): + def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs): self.host_url = host_url self.http_session = requests_retry_session(retries=3, backoff_factor=3) self.http_session.headers.update({ 'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient', }) + self.cdx_auth_token = kwargs.get('cdx_auth_token', + os.environ.get('CDX_AUTH_TOKEN')) + if self.cdx_auth_token: + self.http_session.headers.update({ + 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token), + }) self.wayback_endpoint = "https://web.archive.org/web/" - def lookup_latest(self, url, recent_only=True, follow_redirects=False, redirect_depth=0): + def _query_api(self, params): """ - Looks up most recent HTTP 200 record for the given URL. - - Returns a CDX dict, or None if not found. - - NOTE: could do authorized lookup using cookie to get all fields? 
+ Hits CDX API with a query, parses result into a list of CdxRow """ + resp = self.http_session.get(self.host_url, params=params) + if resp.status_code != 200: + raise CdxApiError(resp.text) + rj = resp.json() + if len(rj) <= 1: + return None + rows = [] + for raw in rj[1:]: + assert len(raw) == 11 # JSON is short + row = CdxRow( + surt=raw[0], + datetime=raw[1], + url=raw[2], + mimetype=raw[3], + status_code=int(raw[4]), + sha1b32=raw[5], + sha1hex=b32_hex(raw[5]), + warc_csize=raw[8], + warc_offset=raw[9], + warc_path=raw[10], + ) + assert (row.mimetype == "-") or ("-" not in row) + rows.append(row) + return rows - if redirect_depth >= 15: - raise CdxApiError("redirect loop (by iteration count)") - - since = datetime.date.today() - datetime.timedelta(weeks=4) + def fetch(self, url, datetime): + """ + Fetches a single CDX row by url/datetime. Raises a KeyError if not + found, because we expect to be looking up a specific full record. + """ + if len(datetime) != 14: + raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)) params = { 'url': url, + 'from': datetime, + 'to': datetime, 'matchType': 'exact', 'limit': -1, 'output': 'json', } - if recent_only: + resp = self._query_api(params) + if not resp: + raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime)) + row = resp[0] + if not (row.url == url and row.datetime == datetime): + raise KeyError("CDX url/datetime not found: {} {} (closest: {})".format(url, datetime, row)) + return row + + def lookup_best(self, url, max_age_days=None, best_mimetype=None): + """ + Fetches multiple CDX rows for the given URL, tries to find the most recent. + + If no matching row is found, return None. Note this is different from fetch. 
+ """ + params = { + 'url': url, + 'matchType': 'exact', + 'limit': -25, + 'output': 'json', + 'collapse': 'timestamp:6', + } + if max_age_days: + since = datetime.date.today() - datetime.timedelta(days=max_age_days) params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day), - if not follow_redirects: - params['filter'] = 'statuscode:200' - resp = self.http_session.get(self.host_url, params=params) - if resp.status_code != 200: - raise CdxApiError(resp.text) - rj = resp.json() - if len(rj) <= 1: + rows = self._query_api(params) + if not rows: return None - cdx = rj[1] - assert len(cdx) == 7 # JSON is short - cdx = dict( - surt=cdx[0], - datetime=cdx[1], - url=cdx[2], - mimetype=cdx[3], - http_status=int(cdx[4]), - sha1b32=cdx[5], - sha1hex=b32_hex(cdx[5]), - ) - if follow_redirects and cdx['http_status'] in (301, 302): - try: - resp = requests.get(self.wayback_endpoint + cdx['datetime'] + "id_/" + cdx['url']) - except requests.exceptions.TooManyRedirects: - raise CdxApiError("redirect loop (wayback fetch)") - next_url = '/'.join(resp.url.split('/')[5:]) - if next_url == url: - raise CdxApiError("redirect loop (by url)") - return self.lookup_latest(next_url, redirect_depth=redirect_depth+1) - return cdx + + def cdx_sort_key(r): + """ + Preference order by status code looks like: + + 200 + mimetype match + most-recent + no match + most-recent + 3xx + most-recent + 4xx + most-recent + 5xx + most-recent + + This function will create a tuple that can be used to sort in *reverse* order. + """ + return ( + r.status_code == 200, + 0 - r.status_code, + r.mimetype == best_mimetype, + r.datetime, + ) + + rows = sorted(rows, key=cdx_sort_key) + return rows[-1] class WaybackError(Exception): |