Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/ia.py      170
-rw-r--r--  python/tests/test_wayback.py  110
2 files changed, 240 insertions, 40 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 886f79e..1522708 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -15,69 +15,159 @@ from gwb.loader import CDXLoaderFactory
from .misc import b32_hex, requests_retry_session
+
+ResourceResult = namedtuple("ResourceResult", [
+ "start_url",
+ "hit",
+ "status",
+ "terminal_url",
+ "terminal_dt",
+ "terminal_status_code",
+ "body",
+ "cdx",
+])
+
+CdxRow = namedtuple('CdxRow', [
+ 'surt',
+ 'datetime',
+ 'url',
+ 'mimetype',
+ 'status_code',
+ 'sha1b32',
+ 'sha1hex',
+ 'warc_csize',
+ 'warc_offset',
+ 'warc_path',
+])
+
+CdxPartial = namedtuple('CdxPartial', [
+ 'surt',
+ 'datetime',
+ 'url',
+ 'mimetype',
+ 'status_code',
+ 'sha1b32',
+ 'sha1hex',
+])
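CdxPartial carries the same capture metadata as CdxRow minus the WARC location fields. A minimal sketch of deriving one from the other (the helper name is illustrative, not part of this patch):

    def cdx_partial_from_row(row):
        # drop warc_csize/warc_offset/warc_path, keep the shared fields
        return CdxPartial(
            surt=row.surt,
            datetime=row.datetime,
            url=row.url,
            mimetype=row.mimetype,
            status_code=row.status_code,
            sha1b32=row.sha1b32,
            sha1hex=row.sha1hex,
        )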
+
class CdxApiError(Exception):
    pass


class CdxApiClient:
- def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"):
+ def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
self.http_session.headers.update({
'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient',
})
+ self.cdx_auth_token = kwargs.get('cdx_auth_token',
+ os.environ.get('CDX_AUTH_TOKEN'))
+ if self.cdx_auth_token:
+ self.http_session.headers.update({
+ 'Cookie': 'cdx_auth_token={}'.format(self.cdx_auth_token),
+ })
self.wayback_endpoint = "https://web.archive.org/web/"
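The auth token can come from either the constructor or the environment; a quick usage sketch (the token value is a placeholder):

    client = CdxApiClient(cdx_auth_token="dummy-token")
    # or: export CDX_AUTH_TOKEN=dummy-token in the environment, then simply
    client = CdxApiClient()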
- def lookup_latest(self, url, recent_only=True, follow_redirects=False, redirect_depth=0):
+ def _query_api(self, params):
"""
- Looks up most recent HTTP 200 record for the given URL.
-
- Returns a CDX dict, or None if not found.
-
- NOTE: could do authorized lookup using cookie to get all fields?
+ Hits the CDX API with a query and parses the result into a list of
+ CdxRow tuples; returns None if there were no result rows.
"""
+ resp = self.http_session.get(self.host_url, params=params)
+ if resp.status_code != 200:
+ raise CdxApiError(resp.text)
+ rj = resp.json()
+ if len(rj) <= 1:
+ return None
+ rows = []
+ for raw in rj[1:]:
+ assert len(raw) == 11  # full CDX JSON rows have 11 fields
+ row = CdxRow(
+ surt=raw[0],
+ datetime=raw[1],
+ url=raw[2],
+ mimetype=raw[3],
+ status_code=int(raw[4]),
+ sha1b32=raw[5],
+ sha1hex=b32_hex(raw[5]),
+ warc_csize=raw[8],
+ warc_offset=raw[9],
+ warc_path=raw[10],
+ )
+ # accept the row if only the mimetype is missing ("-");
+ # otherwise no field should be "-"
+ assert (row.mimetype == "-") or ("-" not in row)
+ rows.append(row)
+ return rows
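For reference, the JSON this parses is a header row followed by 11-field data rows (see CDX_SINGLE_HIT in the tests below). A rough query sketch against the live endpoint:

    client = CdxApiClient()
    rows = client._query_api({
        'url': 'http://fatcat.wiki/',
        'matchType': 'exact',
        'limit': -1,
        'output': 'json',
    })
    # rows is a list of CdxRow tuples, or None if only the header came back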
- if redirect_depth >= 15:
- raise CdxApiError("redirect loop (by iteration count)")
-
- since = datetime.date.today() - datetime.timedelta(weeks=4)
+ def fetch(self, url, datetime):
+ """
+ Fetches a single CDX row by url/datetime. Raises a KeyError if not
+ found, because we expect to be looking up a specific full record.
+ """
+ if len(datetime) != 14:
+ raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime))
params = {
'url': url,
+ 'from': datetime,
+ 'to': datetime,
'matchType': 'exact',
'limit': -1,
'output': 'json',
}
- if recent_only:
+ resp = self._query_api(params)
+ if not resp:
+ raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
+ row = resp[0]
+ if not (row.url == url and row.datetime == datetime):
+ raise KeyError("CDX url/datetime not found: {} {} (closest: {})".format(url, datetime, row))
+ return row
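A usage sketch, using the capture the tests below exercise:

    client = CdxApiClient()
    row = client.fetch("http://fatcat.wiki/", "20180812220054")
    # row.warc_path, row.warc_offset, and row.warc_csize locate the raw WARC
    # record; a miss raises KeyError rather than returning None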
+
+ def lookup_best(self, url, max_age_days=None, best_mimetype=None):
+ """
+ Fetches multiple CDX rows for the given URL and tries to find the single
+ best match, by status code, mimetype match, and recency.
+
+ If no matching row is found, returns None (unlike fetch(), which raises
+ KeyError on a miss).
+ """
+ params = {
+ 'url': url,
+ 'matchType': 'exact',
+ 'limit': -25,
+ 'output': 'json',
+ 'collapse': 'timestamp:6',
+ }
+ if max_age_days:
+ since = datetime.date.today() - datetime.timedelta(days=max_age_days)
params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day)
- if not follow_redirects:
- params['filter'] = 'statuscode:200'
- resp = self.http_session.get(self.host_url, params=params)
- if resp.status_code != 200:
- raise CdxApiError(resp.text)
- rj = resp.json()
- if len(rj) <= 1:
+ rows = self._query_api(params)
+ if not rows:
return None
- cdx = rj[1]
- assert len(cdx) == 7 # JSON is short
- cdx = dict(
- surt=cdx[0],
- datetime=cdx[1],
- url=cdx[2],
- mimetype=cdx[3],
- http_status=int(cdx[4]),
- sha1b32=cdx[5],
- sha1hex=b32_hex(cdx[5]),
- )
- if follow_redirects and cdx['http_status'] in (301, 302):
- try:
- resp = requests.get(self.wayback_endpoint + cdx['datetime'] + "id_/" + cdx['url'])
- except requests.exceptions.TooManyRedirects:
- raise CdxApiError("redirect loop (wayback fetch)")
- next_url = '/'.join(resp.url.split('/')[5:])
- if next_url == url:
- raise CdxApiError("redirect loop (by url)")
- return self.lookup_latest(next_url, redirect_depth=redirect_depth+1)
- return cdx
+
+ def cdx_sort_key(r):
+ """
+ Preference order by status code looks like:
+
+ 200
+   mimetype match
+     most-recent
+   no match
+     most-recent
+ 3xx
+   most-recent
+ 4xx
+   most-recent
+ 5xx
+   most-recent
+
+ This function will create a tuple that can be used to sort in *reverse* order.
+ """
+ return (
+ r.status_code == 200,
+ 0 - r.status_code,
+ r.mimetype == best_mimetype,
+ r.datetime,
+ )
+
+ rows = sorted(rows, key=cdx_sort_key)
+ return rows[-1]
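To make the ordering concrete, here is how the sort key ranks three of the candidate rows from the tests below, with best_mimetype="application/pdf":

    # status  mimetype         datetime        -> key tuple
    # 200     text/html        20180912220054  -> (True,  -200, False, "20180912220054")
    # 200     application/pdf  20180812220054  -> (True,  -200, True,  "20180812220054")
    # 400     application/pdf  20180912220054  -> (False, -400, True,  "20180912220054")
    #
    # ascending sort puts (True, -200, True, ...) last, so rows[-1] is the 200
    # capture with the matching mimetype, even though a non-matching 200 is newer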
class WaybackError(Exception):
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
new file mode 100644
index 0000000..7e63ec7
--- /dev/null
+++ b/python/tests/test_wayback.py
@@ -0,0 +1,110 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
+
+
+CDX_TARGET = "http://fatcat.wiki/"
+CDX_DT = "20180812220054"
+# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
+CDX_SINGLE_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+]
+
+CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
+# cdx -m exact -p output=json http://fatcat.wiki/
+CDX_MULTI_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # more recent, but not the right mimetype
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # more recent and matching mimetype, but wrong status code
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # "best"
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # older
+ ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+]
+
+@pytest.fixture
+def cdx_client():
+ client = CdxApiClient(
+ host_url="http://dummy-cdx/cdx",
+ )
+ return client
+
+@pytest.fixture
+def wayback_client(cdx_client):
+ client = WaybackClient(
+ cdx_client=cdx_client,
+ petabox_webdata_secret="dummy-petabox-secret",
+ )
+ return client
+
+@responses.activate
+def test_cdx_fetch(cdx_client):
+
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SINGLE_HIT))
+
+ resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
+
+ assert len(responses.calls) == 1
+
+ assert resp.datetime == CDX_DT
+ assert resp.url == CDX_TARGET
+ assert resp.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR"
+ assert resp.warc_csize == "8445"
+ assert resp.warc_offset == "108062304"
+ assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
+@responses.activate
+def test_cdx_fetch_errors(cdx_client):
+
+ with pytest.raises(ValueError):
+ resp = cdx_client.fetch(CDX_TARGET, "2019")
+
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SINGLE_HIT))
+
+ with pytest.raises(KeyError):
+ resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
+
+ with pytest.raises(KeyError):
+ resp = cdx_client.fetch("http://some-other.com", CDX_DT)
+
+ resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
+ assert len(responses.calls) == 3
+
+@responses.activate
+def test_cdx_lookup_best(cdx_client):
+
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_MULTI_HIT))
+
+ resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
+
+ assert len(responses.calls) == 1
+
+ assert resp.datetime == CDX_DT
+ assert resp.url == CDX_TARGET
+ assert resp.sha1b32 == CDX_BEST_SHA1B32
+ assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+
+def test_wayback_fetch(wayback_client, mocker):
+ # TODO: not yet implemented; will need to mock out the petabox/WARC
+ # fetch rather than hit the live service
+ return
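One possible shape for this test once implemented, using pytest-mock; the attribute and method names below are placeholders, not the actual WaybackClient interface:

    # def test_wayback_fetch(wayback_client, mocker):
    #     # replace the petabox/WARC reader so no live fetch happens
    #     mocker.patch.object(wayback_client, 'rstore')  # placeholder attribute
    #     wayback_client.rstore.load_resource.return_value = b"%PDF-1.4 ..."
    #     body = wayback_client.fetch_warc_content(  # placeholder method
    #         "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
    #         108062304, 8445)
    #     assert body == b"%PDF-1.4 ..."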
+