author     Bryan Newbold <bnewbold@archive.org>  2020-01-08 13:19:26 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2020-01-08 13:19:29 -0800
commit     2035b62d6e46c1c57243ee3e68d1067a30791f54
tree       1d2632881b0ad4830594490ea8e2943b8e204494
parent     1ca8b792709dde71f350827fdef6e6596dda55a0
download   sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.tar.gz
           sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.zip
refactor CdxApiClient, add tests
- always use auth token and get full CDX rows
- simplify to "fetch" (exact url/dt match) and "lookup_best" methods
- all redirect-following logic will be moved to a higher level
-rw-r--r--  python/sandcrawler/ia.py      170
-rw-r--r--  python/tests/test_wayback.py  110

2 files changed, 240 insertions, 40 deletions
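Before the diff itself, a minimal usage sketch of the two reworked lookup methods may help. This is not part of the commit: the token value is a placeholder, and the URL/datetime are taken from the new test fixtures.

# Hypothetical usage of the refactored client; auth token is a placeholder.
from sandcrawler import CdxApiClient

client = CdxApiClient(cdx_auth_token="dummy-token")

# fetch(): exact url + full 14-digit timestamp match; raises KeyError on miss
row = client.fetch("http://fatcat.wiki/", "20180812220054")
print(row.status_code, row.sha1hex, row.warc_path)

# lookup_best(): returns the single "best" capture or None; prefers
# HTTP 200, then the requested mimetype, then recency
best = client.lookup_best("http://fatcat.wiki/", best_mimetype="application/pdf")
if best:
    print(best.datetime, best.mimetype)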
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 886f79e..1522708 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -15,69 +15,159 @@ from gwb.loader import CDXLoaderFactory
 from .misc import b32_hex, requests_retry_session
 
+
+ResourceResult = namedtuple("ResourceResult", [
+    "start_url",
+    "hit",
+    "status",
+    "terminal_url",
+    "terminal_dt",
+    "terminal_status_code",
+    "body",
+    "cdx",
+])
+
+CdxRow = namedtuple('CdxRow', [
+    'surt',
+    'datetime',
+    'url',
+    'mimetype',
+    'status_code',
+    'sha1b32',
+    'sha1hex',
+    'warc_csize',
+    'warc_offset',
+    'warc_path',
+])
+
+CdxPartial = namedtuple('CdxPartial', [
+    'surt',
+    'datetime',
+    'url',
+    'mimetype',
+    'status_code',
+    'sha1b32',
+    'sha1hex',
+])
+
 class CdxApiError(Exception):
     pass
 
 class CdxApiClient:
 
-    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"):
+    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs):
         self.host_url = host_url
         self.http_session = requests_retry_session(retries=3, backoff_factor=3)
         self.http_session.headers.update({
             'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient',
         })
+        self.cdx_auth_token = kwargs.get('cdx_auth_token',
+            os.environ.get('CDX_AUTH_TOKEN'))
+        if self.cdx_auth_token:
+            self.http_session.headers.update({
+                'Cookie': 'cdx_auth_token={}'.format(self.cdx_auth_token),
+            })
         self.wayback_endpoint = "https://web.archive.org/web/"
 
-    def lookup_latest(self, url, recent_only=True, follow_redirects=False, redirect_depth=0):
+    def _query_api(self, params):
         """
-        Looks up most recent HTTP 200 record for the given URL.
-
-        Returns a CDX dict, or None if not found.
-
-        NOTE: could do authorized lookup using cookie to get all fields?
+        Hits the CDX API with a query, parses the result into a list of CdxRow
         """
+        resp = self.http_session.get(self.host_url, params=params)
+        if resp.status_code != 200:
+            raise CdxApiError(resp.text)
+        rj = resp.json()
+        if len(rj) <= 1:
+            return None
+        rows = []
+        for raw in rj[1:]:
+            assert len(raw) == 11  # full (authorized) CDX API output has 11 fields
+            row = CdxRow(
+                surt=raw[0],
+                datetime=raw[1],
+                url=raw[2],
+                mimetype=raw[3],
+                status_code=int(raw[4]),
+                sha1b32=raw[5],
+                sha1hex=b32_hex(raw[5]),
+                warc_csize=raw[8],
+                warc_offset=raw[9],
+                warc_path=raw[10],
+            )
+            assert (row.mimetype == "-") or ("-" not in row)
+            rows.append(row)
+        return rows
 
-        if redirect_depth >= 15:
-            raise CdxApiError("redirect loop (by iteration count)")
-
-        since = datetime.date.today() - datetime.timedelta(weeks=4)
+    def fetch(self, url, datetime):
+        """
+        Fetches a single CDX row by url/datetime. Raises a KeyError if not
+        found, because we expect to be looking up a specific full record.
+        """
+        if len(datetime) != 14:
+            raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime))
         params = {
             'url': url,
+            'from': datetime,
+            'to': datetime,
             'matchType': 'exact',
             'limit': -1,
             'output': 'json',
         }
-        if recent_only:
+        resp = self._query_api(params)
+        if not resp:
+            raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
+        row = resp[0]
+        if not (row.url == url and row.datetime == datetime):
+            raise KeyError("CDX url/datetime not found: {} {} (closest: {})".format(url, datetime, row))
+        return row
+
+    def lookup_best(self, url, max_age_days=None, best_mimetype=None):
+        """
+        Fetches multiple CDX rows for the given URL and tries to find the
+        "best" capture (not necessarily the most recent; see cdx_sort_key).
+
+        If no matching row is found, returns None; note that this differs
+        from fetch(), which raises KeyError.
+ """ + params = { + 'url': url, + 'matchType': 'exact', + 'limit': -25, + 'output': 'json', + 'collapse': 'timestamp:6', + } + if max_age_days: + since = datetime.date.today() - datetime.timedelta(days=max_age_days) params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day), - if not follow_redirects: - params['filter'] = 'statuscode:200' - resp = self.http_session.get(self.host_url, params=params) - if resp.status_code != 200: - raise CdxApiError(resp.text) - rj = resp.json() - if len(rj) <= 1: + rows = self._query_api(params) + if not rows: return None - cdx = rj[1] - assert len(cdx) == 7 # JSON is short - cdx = dict( - surt=cdx[0], - datetime=cdx[1], - url=cdx[2], - mimetype=cdx[3], - http_status=int(cdx[4]), - sha1b32=cdx[5], - sha1hex=b32_hex(cdx[5]), - ) - if follow_redirects and cdx['http_status'] in (301, 302): - try: - resp = requests.get(self.wayback_endpoint + cdx['datetime'] + "id_/" + cdx['url']) - except requests.exceptions.TooManyRedirects: - raise CdxApiError("redirect loop (wayback fetch)") - next_url = '/'.join(resp.url.split('/')[5:]) - if next_url == url: - raise CdxApiError("redirect loop (by url)") - return self.lookup_latest(next_url, redirect_depth=redirect_depth+1) - return cdx + + def cdx_sort_key(r): + """ + Preference order by status code looks like: + + 200 + mimetype match + most-recent + no match + most-recent + 3xx + most-recent + 4xx + most-recent + 5xx + most-recent + + This function will create a tuple that can be used to sort in *reverse* order. + """ + return ( + r.status_code == 200, + 0 - r.status_code, + r.mimetype == best_mimetype, + r.datetime, + ) + + rows = sorted(rows, key=cdx_sort_key) + return rows[-1] class WaybackError(Exception): diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py new file mode 100644 index 0000000..7e63ec7 --- /dev/null +++ b/python/tests/test_wayback.py @@ -0,0 +1,110 @@ + +import json +import pytest +import responses + +from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError + + +CDX_TARGET = "http://fatcat.wiki/" +CDX_DT = "20180812220054" +# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/ +CDX_SINGLE_HIT = [ + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], + ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], +] + +CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR" +# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/ +CDX_MULTI_HIT = [ + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], + ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + # sooner, but not right mimetype + ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + # sooner and mimetype, but wrong status code + ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + ["wiki,fatcat)/", 
"20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + # "best" + ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + # older + ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], +] + +@pytest.fixture +def cdx_client(): + client = CdxApiClient( + host_url="http://dummy-cdx/cdx", + ) + return client + +@pytest.fixture +def wayback_client(cdx_client): + client = WaybackClient( + cdx_client=cdx_client, + petabox_webdata_secret="dummy-petabox-secret", + ) + return client + +@responses.activate +def test_cdx_fetch(cdx_client): + + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SINGLE_HIT)) + + resp = cdx_client.fetch(CDX_TARGET, CDX_DT) + + assert len(responses.calls) == 1 + + assert resp.datetime == CDX_DT + assert resp.url == CDX_TARGET + assert resp.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR" + assert resp.warc_csize == "8445" + assert resp.warc_offset == "108062304" + assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + +@responses.activate +def test_cdx_fetch_errors(cdx_client): + + with pytest.raises(ValueError): + resp = cdx_client.fetch(CDX_TARGET, "2019") + + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SINGLE_HIT)) + + with pytest.raises(KeyError): + resp = cdx_client.fetch(CDX_TARGET, "20180812220055") + + with pytest.raises(KeyError): + resp = cdx_client.fetch("http://some-other.com", CDX_DT) + + resp = cdx_client.fetch(CDX_TARGET, CDX_DT) + assert len(responses.calls) == 3 + +@responses.activate +def test_cdx_lookup_best(cdx_client): + + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_MULTI_HIT)) + + resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf") + + assert len(responses.calls) == 1 + + assert resp.datetime == CDX_DT + assert resp.url == CDX_TARGET + assert resp.sha1b32 == CDX_BEST_SHA1B32 + assert resp.warc_path == CDX_SINGLE_HIT[1][-1] + +def test_wayback_fetch(wayback_client, mocker): + # mock something + #mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka') + #blah = mocker.Mock() + return + |