diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-08 13:19:26 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-08 13:19:29 -0800 |
commit | 2035b62d6e46c1c57243ee3e68d1067a30791f54 (patch) | |
tree | 1d2632881b0ad4830594490ea8e2943b8e204494 /python/sandcrawler | |
parent | 1ca8b792709dde71f350827fdef6e6596dda55a0 (diff) | |
download | sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.tar.gz sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.zip |
refactor CdxApiClient, add tests
- always use auth token and get full CDX rows
- simplify to "fetch" (exact url/dt match) and "lookup_best" methods
- all redirect stuff will be moved to a higher level
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- | python/sandcrawler/ia.py | 170 |
1 file changed, 130 insertions, 40 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 886f79e..1522708 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -15,69 +15,159 @@ from gwb.loader import CDXLoaderFactory from .misc import b32_hex, requests_retry_session + +ResourceResult = namedtuple("ResourceResult", [ + "start_url", + "hit", + "status", + "terminal_url", + "terminal_dt", + "terminal_status_code", + "body", + "cdx", +]) + +CdxRow = namedtuple('CdxRow', [ + 'surt', + 'datetime', + 'url', + 'mimetype', + 'status_code', + 'sha1b32', + 'sha1hex', + 'warc_csize', + 'warc_offset', + 'warc_path', +]) + +CdxPartial = namedtuple('CdxPartial', [ + 'surt', + 'datetime', + 'url', + 'mimetype', + 'status_code', + 'sha1b32', + 'sha1hex', +]) + class CdxApiError(Exception): pass class CdxApiClient: - def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"): + def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs): self.host_url = host_url self.http_session = requests_retry_session(retries=3, backoff_factor=3) self.http_session.headers.update({ 'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient', }) + self.cdx_auth_token = kwargs.get('cdx_auth_token', + os.environ.get('CDX_AUTH_TOKEN')) + if self.cdx_auth_token: + self.http_session.headers.update({ + 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token), + }) self.wayback_endpoint = "https://web.archive.org/web/" - def lookup_latest(self, url, recent_only=True, follow_redirects=False, redirect_depth=0): + def _query_api(self, params): """ - Looks up most recent HTTP 200 record for the given URL. - - Returns a CDX dict, or None if not found. - - NOTE: could do authorized lookup using cookie to get all fields? 
+ Hits CDX API with a query, parses result into a list of CdxRow """ + resp = self.http_session.get(self.host_url, params=params) + if resp.status_code != 200: + raise CdxApiError(resp.text) + rj = resp.json() + if len(rj) <= 1: + return None + rows = [] + for raw in rj[1:]: + assert len(raw) == 11 # JSON is short + row = CdxRow( + surt=raw[0], + datetime=raw[1], + url=raw[2], + mimetype=raw[3], + status_code=int(raw[4]), + sha1b32=raw[5], + sha1hex=b32_hex(raw[5]), + warc_csize=raw[8], + warc_offset=raw[9], + warc_path=raw[10], + ) + assert (row.mimetype == "-") or ("-" not in row) + rows.append(row) + return rows - if redirect_depth >= 15: - raise CdxApiError("redirect loop (by iteration count)") - - since = datetime.date.today() - datetime.timedelta(weeks=4) + def fetch(self, url, datetime): + """ + Fetches a single CDX row by url/datetime. Raises a KeyError if not + found, because we expect to be looking up a specific full record. + """ + if len(datetime) != 14: + raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)) params = { 'url': url, + 'from': datetime, + 'to': datetime, 'matchType': 'exact', 'limit': -1, 'output': 'json', } - if recent_only: + resp = self._query_api(params) + if not resp: + raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime)) + row = resp[0] + if not (row.url == url and row.datetime == datetime): + raise KeyError("CDX url/datetime not found: {} {} (closest: {})".format(url, datetime, row)) + return row + + def lookup_best(self, url, max_age_days=None, best_mimetype=None): + """ + Fetches multiple CDX rows for the given URL, tries to find the most recent. + + If no matching row is found, return None. Note this is different from fetch. 
+ """ + params = { + 'url': url, + 'matchType': 'exact', + 'limit': -25, + 'output': 'json', + 'collapse': 'timestamp:6', + } + if max_age_days: + since = datetime.date.today() - datetime.timedelta(days=max_age_days) params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day), - if not follow_redirects: - params['filter'] = 'statuscode:200' - resp = self.http_session.get(self.host_url, params=params) - if resp.status_code != 200: - raise CdxApiError(resp.text) - rj = resp.json() - if len(rj) <= 1: + rows = self._query_api(params) + if not rows: return None - cdx = rj[1] - assert len(cdx) == 7 # JSON is short - cdx = dict( - surt=cdx[0], - datetime=cdx[1], - url=cdx[2], - mimetype=cdx[3], - http_status=int(cdx[4]), - sha1b32=cdx[5], - sha1hex=b32_hex(cdx[5]), - ) - if follow_redirects and cdx['http_status'] in (301, 302): - try: - resp = requests.get(self.wayback_endpoint + cdx['datetime'] + "id_/" + cdx['url']) - except requests.exceptions.TooManyRedirects: - raise CdxApiError("redirect loop (wayback fetch)") - next_url = '/'.join(resp.url.split('/')[5:]) - if next_url == url: - raise CdxApiError("redirect loop (by url)") - return self.lookup_latest(next_url, redirect_depth=redirect_depth+1) - return cdx + + def cdx_sort_key(r): + """ + Preference order by status code looks like: + + 200 + mimetype match + most-recent + no match + most-recent + 3xx + most-recent + 4xx + most-recent + 5xx + most-recent + + This function will create a tuple that can be used to sort in *reverse* order. + """ + return ( + r.status_code == 200, + 0 - r.status_code, + r.mimetype == best_mimetype, + r.datetime, + ) + + rows = sorted(rows, key=cdx_sort_key) + return rows[-1] class WaybackError(Exception): |