diff options
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- | python/sandcrawler/ia.py | 135 |
1 file changed, 135 insertions, 0 deletions
# XXX: some broken MRO thing going on in here due to python3 object wrangling
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file

import os
import sys
import base64

import requests

import wayback.exception
from http.client import IncompleteRead
from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory


def b32_hex(s):
    """Convert a base32-encoded SHA-1 digest to lowercase hex.

    Accepts an optional "sha1:" prefix (as found in CDX/WARC metadata).
    Raises ValueError if the input is not a 32-character base32 string.
    """
    s = s.strip().split(':')[-1].lower()
    if len(s) != 32:
        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')


class CdxApiError(Exception):
    """Raised on unexpected responses from the CDX search API."""
    pass


class CdxApiClient:
    """Thin client for the web.archive.org CDX search API."""

    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"):
        # base URL of the CDX search endpoint
        self.host_url = host_url

    def lookup_latest(self, url):
        """
        Looks up most recent HTTP 200 record for the given URL.

        Returns a CDX dict, or None if not found.

        Raises CdxApiError on a non-200 API response or malformed row.

        XXX: should do authorized lookup using cookie to get all fields
        """
        resp = requests.get(self.host_url, params={
            'url': url,
            'matchType': 'exact',
            # negative limit asks the CDX server for the *last* N captures,
            # so -1 returns only the most recent matching record
            'limit': -1,
            'filter': 'statuscode:200',
            'output': 'json',
        })
        if resp.status_code != 200:
            # BUG FIX: was `CDXApiError`, an undefined name (NameError)
            raise CdxApiError(resp.text)
        rj = resp.json()
        # first row of JSON output is the column-header row; <= 1 rows
        # means there were no actual captures
        if len(rj) <= 1:
            return None
        cdx = rj[1]
        # unauthorized lookups return exactly 7 columns; anything else is
        # unexpected (was a bare `assert`, which -O strips)
        if len(cdx) != 7:
            raise CdxApiError("expected 7 CDX columns, got {}".format(len(cdx)))
        return dict(
            surt=cdx[0],
            datetime=cdx[1],
            url=cdx[2],
            mimetype=cdx[3],
            http_status=int(cdx[4]),
            sha1b32=cdx[5],
            # BUG FIX: b32_hex was called but never defined/imported in this
            # file; defined above
            sha1hex=b32_hex(cdx[5]),
        )


class WaybackError(Exception):
    """Raised when an archived resource cannot be fetched or parsed."""
    pass


class WaybackClient:
    """Fetches archived HTTP response bodies out of petabox WARC files."""

    def __init__(self, cdx_client=None, **kwargs):
        self.cdx_client = cdx_client if cdx_client else CdxApiClient()
        # /serve/ instead of /download/ doesn't record view count
        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
        self.petabox_webdata_secret = kwargs.get(
            'petabox_webdata_secret',
            os.environ.get('PETABOX_WEBDATA_SECRET'))
        self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
        # ResourceStore is constructed lazily on first fetch
        self.rstore = None

    def fetch_warc_content(self, warc_path, offset, c_size):
        """
        Fetches the raw HTTP response body for a single WARC record.

        warc_path: WARC file path, appended to warc_uri_prefix
        offset, c_size: byte offset and compressed size of the record

        Returns the raw body as bytes. Raises WaybackError on any
        load/parse failure, or if the archived response was not HTTP 200.
        """
        warc_uri = self.warc_uri_prefix + warc_path
        if not self.rstore:
            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
                webdata_secret=self.petabox_webdata_secret,
                download_base_url=self.petabox_base_url))
        try:
            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
        except wayback.exception.ResourceUnavailable:
            raise WaybackError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
        except ValueError as ve:
            raise WaybackError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
        except EOFError as eofe:
            raise WaybackError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
        except TypeError as te:
            raise WaybackError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
        # Note: could consider a generic "except Exception" here, as we get so
        # many petabox errors. Do want jobs to fail loud and clear when the
        # whole cluster is down though.

        if gwb_record.get_status()[0] != 200:
            raise WaybackError("archived HTTP response (WARC) was not 200: {}".format(gwb_record.get_status()[0]))

        try:
            raw_content = gwb_record.open_raw_content().read()
        except IncompleteRead as ire:
            raise WaybackError("failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
        return raw_content

    def fetch_url_datetime(self, url, datetime):
        """Fetch the body of the capture of `url` at `datetime`.

        NOTE(review): this is currently broken as written -- CdxApiClient
        defines only lookup_latest(), not lookup(), and neither returns
        warc_path/warc_offset/warc_csize keys (those require an authorized
        CDX lookup). Left as-is pending a datetime-specific lookup method;
        calling this will raise AttributeError.
        """
        cdx_row = self.cdx_client.lookup(url, datetime)
        return self.fetch_warc_content(
            cdx_row['warc_path'],
            cdx_row['warc_offset'],
            cdx_row['warc_csize'])


class SavePageNowError(Exception):
    """Raised when a Save Page Now capture request fails."""
    pass


class SavePageNowClient:
    """Client for the archive.org Save Page Now (SPN) capture endpoint."""

    def __init__(self, cdx_client=None, endpoint="https://web.archive.org/save/"):
        self.cdx_client = cdx_client if cdx_client else CdxApiClient()
        # capture endpoint; the target URL is appended directly
        self.endpoint = endpoint

    def save_url_now(self, url):
        """
        Returns a tuple (cdx, blob) on success, or raises an error on non-success.

        XXX: handle redirects?
        """
        resp = requests.get(self.endpoint + url)
        if resp.status_code != 200:
            raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
        body = resp.content
        # cdx may be None if the capture hasn't been indexed yet
        cdx = self.cdx_client.lookup_latest(url)
        return (cdx, body)