Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r--  python/sandcrawler/ia.py  135
1 file changed, 135 insertions(+), 0 deletions(-)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
new file mode 100644
index 0000000..365cf82
--- /dev/null
+++ b/python/sandcrawler/ia.py
@@ -0,0 +1,135 @@
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import requests
+
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
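+# b32_hex() is used below but never imported or defined in this file; the
+# helper presumably lives elsewhere in sandcrawler. A minimal self-contained
+# sketch, assuming CDX-style base32-encoded SHA-1 digests (20 bytes encode
+# to exactly 32 base32 characters):
+import base64
+
+def b32_hex(s):
+    """Convert a base32-encoded SHA-1 digest (optionally 'sha1:'-prefixed) to hex."""
+    s = s.strip().split(':')[-1]
+    if len(s) != 32:
+        raise ValueError("not a base32-encoded sha-1 digest: {}".format(s))
+    return base64.b32decode(s.upper()).hex()
+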
+class CdxApiError(Exception):
+    pass
+
+class CdxApiClient:
+
+    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"):
+        self.host_url = host_url
+
+    def lookup_latest(self, url):
+        """
+        Looks up the most recent HTTP 200 record for the given URL.
+
+        Returns a CDX dict, or None if not found.
+
+        XXX: should do authorized lookup using cookie to get all fields
+        """
+
+        resp = requests.get(self.host_url, params={
+            'url': url,
+            'matchType': 'exact',
+            'limit': -1,
+            'filter': 'statuscode:200',
+            'output': 'json',
+        })
+        if resp.status_code != 200:
+            raise CdxApiError(resp.text)
+        rj = resp.json()
+        if len(rj) <= 1:
+            return None
+        cdx = rj[1]
+        assert len(cdx) == 7    # "short" (unauthorized) CDX JSON rows have 7 fields
+        cdx = dict(
+            surt=cdx[0],
+            datetime=cdx[1],
+            url=cdx[2],
+            mimetype=cdx[3],
+            http_status=int(cdx[4]),
+            sha1b32=cdx[5],
+            sha1hex=b32_hex(cdx[5]),
+        )
+        return cdx
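+
+# Illustrative usage sketch (the URL here is a hypothetical example):
+#
+#   client = CdxApiClient()
+#   cdx = client.lookup_latest("http://example.com/paper.pdf")
+#   if cdx:
+#       print(cdx['datetime'], cdx['sha1hex'])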
+
+
+class WaybackError(Exception):
+    pass
+
+class WaybackClient:
+
+    def __init__(self, cdx_client=None, **kwargs):
+        if cdx_client:
+            self.cdx_client = cdx_client
+        else:
+            self.cdx_client = CdxApiClient()
+        # /serve/ instead of /download/ doesn't record view count
+        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
+        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+        self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
+        self.rstore = None
+
+    def fetch_warc_content(self, warc_path, offset, c_size):
+        warc_uri = self.warc_uri_prefix + warc_path
+        if not self.rstore:
+            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+                webdata_secret=self.petabox_webdata_secret,
+                download_base_url=self.petabox_base_url))
+        try:
+            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
+        except wayback.exception.ResourceUnavailable:
+            raise WaybackError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
+        except ValueError as ve:
+            raise WaybackError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+        except EOFError as eofe:
+            raise WaybackError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+        except TypeError as te:
+            raise WaybackError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+        # Note: could consider a generic "except Exception" here, as we get so
+        # many petabox errors. Do want jobs to fail loud and clear when the
+        # whole cluster is down though.
+
+        if gwb_record.get_status()[0] != 200:
+            raise WaybackError("archived HTTP response (WARC) was not 200: {}".format(gwb_record.get_status()[0]))
+
+        try:
+            raw_content = gwb_record.open_raw_content().read()
+        except IncompleteRead as ire:
+            raise WaybackError("failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+        return raw_content
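+
+    # Illustrative usage sketch (the WARC path, offset, and size here are
+    # hypothetical; real values come from CDX lookups):
+    #
+    #   client = WaybackClient()
+    #   blob = client.fetch_warc_content(
+    #       "example/EXAMPLE-20180203-000000.warc.gz", 12345, 67890)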
+
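+    # Note: this assumes a cdx_client.lookup(url, datetime) method and CDX
+    # warc_path/warc_offset/warc_csize fields, neither of which CdxApiClient
+    # above provides yet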
+    def fetch_url_datetime(self, url, datetime):
+        cdx_row = self.cdx_client.lookup(url, datetime)
+        return self.fetch_warc_content(
+            cdx_row['warc_path'],
+            cdx_row['warc_offset'],
+            cdx_row['warc_csize'])
+
+
+class SavePageNowError(Exception):
+    pass
+
+class SavePageNowClient:
+
+    def __init__(self, cdx_client=None, endpoint="https://web.archive.org/save/"):
+        if cdx_client:
+            self.cdx_client = cdx_client
+        else:
+            self.cdx_client = CdxApiClient()
+        self.endpoint = endpoint
+
+    def save_url_now(self, url):
+        """
+        Returns a tuple (cdx, blob) on success, or raises an error on non-success.
+
+        XXX: handle redirects?
+        """
+        resp = requests.get(self.endpoint + url)
+        if resp.status_code != 200:
+            raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
+        body = resp.content
+        cdx = self.cdx_client.lookup_latest(url)
+        return (cdx, body)
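+
+# Illustrative usage sketch (hypothetical example URL; lookup_latest() may
+# return None if the CDX index hasn't caught up with the fresh capture yet):
+#
+#   spn = SavePageNowClient()
+#   cdx, body = spn.save_url_now("http://example.com/paper.pdf")
+#   print(cdx['sha1hex'] if cdx else "no CDX record yet")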
+