# XXX: some broken MRO thing going on in here due to python3 object wrangling
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file

import os
import sys
import base64

import requests

import wayback.exception
from http.client import IncompleteRead
from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory


def b32_hex(s):
    """Convert a base32-encoded SHA-1 digest to lower-case hex.

    Accepts an optional "sha1:" prefix (as returned by some IA APIs).

    Raises:
        ValueError: if the input is not a 32-character base32 string.
    """
    s = s.strip().split(':')[-1].lower()
    if len(s) != 32:
        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')


class CdxApiError(Exception):
    """Raised on a non-200 (or malformed) response from the CDX API."""
    pass


class CdxApiClient:
    """Thin client for the web.archive.org CDX (capture index) API."""

    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"):
        self.host_url = host_url

    def lookup_latest(self, url):
        """
        Looks up most recent HTTP 200 record for the given URL.

        Returns a CDX dict, or None if not found.

        Raises:
            CdxApiError: on a non-200 API response or a short CDX row.

        XXX: should do authorized lookup using cookie to get all fields
        """
        resp = requests.get(self.host_url, params={
            'url': url,
            'matchType': 'exact',
            'limit': -1,
            'filter': 'statuscode:200',
            'output': 'json',
        })
        if resp.status_code != 200:
            # fix: was `CDXApiError`, an undefined name (NameError at runtime)
            raise CdxApiError(resp.text)
        rj = resp.json()
        # first JSON row is the column-header row; need at least one data row
        if len(rj) <= 1:
            return None
        cdx = rj[1]
        if len(cdx) != 7:
            # was a bare `assert` (stripped under -O); raise a catchable error
            raise CdxApiError("CDX JSON row is short: {}".format(cdx))
        cdx = dict(
            surt=cdx[0],
            datetime=cdx[1],
            url=cdx[2],
            mimetype=cdx[3],
            http_status=int(cdx[4]),
            sha1b32=cdx[5],
            sha1hex=b32_hex(cdx[5]),
        )
        return cdx


class WaybackError(Exception):
    """Raised when WARC content can't be fetched/read from wayback/petabox."""
    pass


class WaybackClient:
    """Fetches raw WARC record content out of petabox via the gwb library."""

    def __init__(self, cdx_client=None, **kwargs):
        if cdx_client:
            self.cdx_client = cdx_client
        else:
            self.cdx_client = CdxApiClient()
        # /serve/ instead of /download/ doesn't record view count
        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
            os.environ.get('PETABOX_WEBDATA_SECRET'))
        self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
        # lazily constructed on first fetch (gwb ResourceStore)
        self.rstore = None

    def fetch_warc_content(self, warc_path, offset, c_size):
        """Fetch the raw body of a single archived HTTP 200 response.

        Args:
            warc_path: WARC file path, appended to `warc_uri_prefix`.
            offset: byte offset of the record within the WARC.
            c_size: compressed size of the record.

        Returns the raw response body as bytes.

        Raises:
            WaybackError: on any petabox/wayback fetch or read failure, or
                if the archived HTTP response status was not 200.
        """
        warc_uri = self.warc_uri_prefix + warc_path
        if not self.rstore:
            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
                webdata_secret=self.petabox_webdata_secret,
                download_base_url=self.petabox_base_url))
        try:
            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
        except wayback.exception.ResourceUnavailable as e:
            # chain causes (`from e`) so the underlying error isn't lost
            raise WaybackError("failed to load file contents from wayback/petabox (ResourceUnavailable)") from e
        except ValueError as ve:
            raise WaybackError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) from ve
        except EOFError as eofe:
            raise WaybackError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) from eofe
        except TypeError as te:
            raise WaybackError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) from te
        # Note: could consider a generic "except Exception" here, as we get so
        # many petabox errors. Do want jobs to fail loud and clear when the
        # whole cluster is down though.
        if gwb_record.get_status()[0] != 200:
            raise WaybackError("archived HTTP response (WARC) was not 200: {}".format(gwb_record.get_status()[0]))
        try:
            raw_content = gwb_record.open_raw_content().read()
        except IncompleteRead as ire:
            raise WaybackError("failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) from ire
        return raw_content

    def fetch_url_datetime(self, url, datetime):
        """Fetch archived content for a (url, datetime) capture.

        FIXME(review): `CdxApiClient` has no `lookup()` method (only
        `lookup_latest()`), and `lookup_latest()` does not return the
        `warc_path`/`warc_offset`/`warc_csize` keys used below — this method
        appears to depend on an unimplemented datetime-specific CDX lookup.
        """
        cdx_row = self.cdx_client.lookup(url, datetime)
        return self.fetch_warc_content(
            cdx_row['warc_path'],
            cdx_row['warc_offset'],
            cdx_row['warc_csize'])


class SavePageNowError(Exception):
    """Raised when a Save-Page-Now request does not succeed."""
    pass


class SavePageNowClient:
    """Client for the web.archive.org Save-Page-Now (SPN) endpoint."""

    def __init__(self, cdx_client=None, endpoint="https://web.archive.org/save/"):
        if cdx_client:
            self.cdx_client = cdx_client
        else:
            self.cdx_client = CdxApiClient()
        self.endpoint = endpoint

    def save_url_now(self, url):
        """
        Returns a tuple (cdx, blob) on success, or raises an error on
        non-success.

        Note: `cdx` may be None if the capture hasn't shown up in the CDX
        index yet (lookup_latest returns None on no match).

        Raises:
            SavePageNowError: on a non-200 response from the SPN endpoint.
            CdxApiError: on CDX lookup failure.

        XXX: handle redirects?
        """
        resp = requests.get(self.endpoint + url)
        if resp.status_code != 200:
            raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
        body = resp.content
        cdx = self.cdx_client.lookup_latest(url)
        return (cdx, body)