1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
|
# XXX: some broken MRO thing going on in here due to python3 object wrangling
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
import os, sys
import requests
import wayback.exception
from http.client import IncompleteRead
from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
class CdxApiError(Exception):
    """Exception type for errors from CDX API lookups."""
class CdxApiClient:
    """Small HTTP client for a wayback CDX search endpoint."""

    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"):
        """Store the CDX search endpoint URL that lookups are sent to."""
        self.host_url = host_url

    def lookup_latest(self, url):
        """
        Looks up most recent HTTP 200 record for the given URL.

        Returns a CDX dict (keys: surt, datetime, url, mimetype, http_status,
        sha1b32, sha1hex), or None if not found.

        Raises CdxApiError if the API does not answer with HTTP 200, or if
        the returned row does not have the expected number of fields.

        XXX: should do authorized lookup using cookie to get all fields
        """
        resp = requests.get(self.host_url, params={
            'url': url,
            'matchType': 'exact',
            'limit': -1,
            'filter': 'statuscode:200',
            'output': 'json',
        })
        if resp.status_code != 200:
            raise CdxApiError(resp.text)
        rj = resp.json()
        # Index 0 is skipped (presumably a header row in the JSON output), so
        # a response with one entry or fewer carries no capture.
        if len(rj) <= 1:
            return None
        cdx = rj[1]
        # JSON is short: this lookup yields 7 fields per row. Raise
        # explicitly instead of using `assert`, which is stripped under -O.
        if len(cdx) != 7:
            raise CdxApiError(
                "expected 7 fields in CDX row, got {}: {}".format(len(cdx), cdx))
        # NOTE(review): `b32_hex` is not defined in the visible part of this
        # module; it is presumably provided elsewhere -- confirm.
        cdx = dict(
            surt=cdx[0],
            datetime=cdx[1],
            url=cdx[2],
            mimetype=cdx[3],
            http_status=int(cdx[4]),
            sha1b32=cdx[5],
            sha1hex=b32_hex(cdx[5]),
        )
        return cdx
class WaybackError(Exception):
    """Exception type for failures fetching archived content from wayback/petabox."""
class WaybackClient:
    """Fetches archived HTTP response bodies out of WARC files via petabox."""

    def __init__(self, cdx_client=None, **kwargs):
        """
        Set up the client; the resource store is created lazily on first fetch.

        cdx_client: object used for CDX lookups; a default CdxApiClient is
            created when none (or a falsy value) is given.

        Recognized keyword arguments:
            petabox_base_url: download base URL handed to the loader factory
            petabox_webdata_secret: secret handed to the loader factory;
                defaults to the PETABOX_WEBDATA_SECRET environment variable
            warc_uri_prefix: prefix prepended to WARC paths when fetching
        """
        if cdx_client:
            self.cdx_client = cdx_client
        else:
            self.cdx_client = CdxApiClient()
        # /serve/ instead of /download/ doesn't record view count
        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
        self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
        self.rstore = None

    def fetch_warc_content(self, warc_path, offset, c_size):
        """
        Load one record from a WARC file and return its raw content.

        warc_path: path of the WARC file, appended to `warc_uri_prefix`
        offset, c_size: passed through to the resource store's
            `load_resource` together with the WARC URI

        Returns whatever `read()` on the record's raw content stream yields.

        Raises WaybackError if the record can not be loaded, if the archived
        HTTP response status was not 200, or if the content read is
        incomplete. The triggering exception is attached as `__cause__`.
        """
        warc_uri = self.warc_uri_prefix + warc_path
        if not self.rstore:
            # Created on first use and then kept for later fetches
            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
                webdata_secret=self.petabox_webdata_secret,
                download_base_url=self.petabox_base_url))
        try:
            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
        except wayback.exception.ResourceUnavailable as rue:
            raise WaybackError("failed to load file contents from wayback/petabox (ResourceUnavailable)") from rue
        except ValueError as ve:
            raise WaybackError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) from ve
        except EOFError as eofe:
            raise WaybackError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) from eofe
        except TypeError as te:
            raise WaybackError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) from te
        # Note: could consider a generic "except Exception" here, as we get so
        # many petabox errors. Do want jobs to fail loud and clear when the
        # whole cluster is down though.

        status = gwb_record.get_status()[0]
        if status != 200:
            raise WaybackError("archived HTTP response (WARC) was not 200: {}".format(status))

        try:
            raw_content = gwb_record.open_raw_content().read()
        except IncompleteRead as ire:
            raise WaybackError("failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) from ire
        return raw_content

    def fetch_url_datetime(self, url, datetime):
        """
        Look up a capture via the CDX client and return its raw content.

        The row returned by `cdx_client.lookup(url, datetime)` must provide
        'warc_path', 'warc_offset' and 'warc_csize'.

        NOTE(review): the CdxApiClient class in this file only defines
        `lookup_latest`, not `lookup`, so this method needs a cdx_client
        that offers `lookup` -- confirm against callers.
        """
        cdx_row = self.cdx_client.lookup(url, datetime)
        return self.fetch_warc_content(
            cdx_row['warc_path'],
            cdx_row['warc_offset'],
            cdx_row['warc_csize'])
class SavePageNowError(Exception):
    """Exception type for save-page-now requests that do not succeed."""
class SavePageNowClient:
    """Requests a fresh capture of a URL through a "save page now" endpoint."""

    def __init__(self, cdx_client=None, endpoint="https://web.archive.org/save/"):
        """Keep the save endpoint and a CDX client (a default one if none is given)."""
        self.cdx_client = cdx_client if cdx_client else CdxApiClient()
        self.endpoint = endpoint

    def save_url_now(self, url):
        """
        Request a capture of `url`, then look up its latest CDX record.

        Returns a tuple (cdx, blob) on success, where blob is the body of the
        save response; raises SavePageNowError when the save request does not
        come back with HTTP 200.

        XXX: handle redirects?
        """
        save_resp = requests.get(self.endpoint + url)
        status = save_resp.status_code
        if status != 200:
            raise SavePageNowError("HTTP status: {}, url: {}".format(status, url))
        blob = save_resp.content
        latest_cdx = self.cdx_client.lookup_latest(url)
        return (latest_cdx, blob)
|