author     Bryan Newbold <bnewbold@archive.org>  2020-01-08 23:11:30 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2020-01-09 16:32:47 -0800
commit     d8689bce0a3a69254554dbdfa929e712e9bbc02c (patch)
tree       f4c80a798358317aee8657e066dc1c7a37a6ba20 /python
parent     6454cdc93424b23f484fc56e1f9f986490d05c2b (diff)
wayback fetch via replay; confirm hashes in crawl_resource()
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/ia.py  45
1 file changed, 40 insertions(+), 5 deletions(-)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index ac0fef8..74cd978 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -212,6 +212,7 @@ class WaybackClient:
         self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
         self.rstore = None
         self.max_redirects = 25
+        self.wayback_endpoint = "https://web.archive.org/web/"
 
     def fetch_petabox(self, c_size, offset, warc_path):
         """
@@ -292,9 +293,20 @@ class WaybackClient:
         Intended for use with SPN2 requests, where request body has not ended
         up in petabox yet.
 
-        TODO: is this actually necessary?
+        TODO: is this really necessary?
         """
-        raise NotImplementedError
+        try:
+            resp = requests.get(self.wayback_endpoint + datetime + "id_/" + url)
+        except requests.exceptions.TooManyRedirects:
+            raise WaybackError("redirect loop (wayback replay fetch)")
+        try:
+            resp.raise_for_status()
+        except Exception as e:
+            raise WaybackError(str(e))
+        # TODO: some verification here?
+        #print(resp.url, file=sys.stderr)
+        #print(resp.content)
+        return resp.content
 
     def lookup_resource(self, start_url, best_mimetype=None):
         """
@@ -473,6 +485,16 @@ class SavePageNowClient:
         if not final_json:
             raise SavePageNowError("SPN2 timed out (polling count exceeded)")
 
+        if final_json.get('original_job_id'):
+            resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, final_json['original_job_id']))
+            try:
+                resp.raise_for_status()
+            except:
+                raise SavePageNowError(resp.content)
+            final_json = resp.json()
+
+        #print(final_json, file=sys.stderr)
+
         if final_json['status'] == "success":
             return SavePageNowResult(
                 True,
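
The original_job_id branch appears to cover SPN2 deduplication: if the URL was captured recently, the status response points at an earlier job, whose status has to be fetched separately before the result can be interpreted. A standalone sketch of that follow-up request (the endpoint value and job id below are assumptions, not taken from this diff):

    import requests

    v2endpoint = "https://web.archive.org/save"  # assumed SPN2 API base
    session = requests.Session()

    final_json = {"original_job_id": "spn2-0123456789abcdef"}  # hypothetical
    if final_json.get("original_job_id"):
        resp = session.get(
            "{}/status/{}".format(v2endpoint, final_json["original_job_id"]))
        resp.raise_for_status()  # the diff wraps this in SavePageNowError
        final_json = resp.json()
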
@@ -515,10 +537,23 @@ class SavePageNowClient:
                 cdx=None,
             )
 
-        # fetch CDX and body
-        cdx_row = wayback_client.cdx_client.fetch(spn_result.terminal_url, spn_result.terminal_dt)
+        # sleep one second to let CDX API update (XXX)
+        time.sleep(1.0)
+
+        # fetch CDX, then body
+        cdx_row = wayback_client.cdx_client.fetch(
+            spn_result.terminal_url,
+            spn_result.terminal_dt,
+            filter_status_code=200,
+        )
+        print(spn_result, file=sys.stderr)
+        print(cdx_row, file=sys.stderr)
         assert cdx_row.status_code == 200
-        body = wayback_client.fetch_petabox_body(cdx_row.warc_csize, cdx_row.warc_offset, cdx_row.warc_path)
+        body = wayback_client.fetch_replay_body(cdx_row.url, cdx_row.datetime)
+        file_meta = gen_file_metadata(body)
+        if cdx_row.sha1hex != file_meta['sha1hex']:
+            print(file_meta, file=sys.stderr)
+            raise WaybackError("replay fetch resulted in non-matching SHA1 (vs. CDX API)")
 
         # not a full CDX yet
         cdx_partial = cdx_partial_from_row(cdx_row)
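
The hash confirmation added above guards against corrupted or mismatched replay fetches: the SHA-1 of the body fetched via replay must agree with the digest the CDX API reported for the capture. Conceptually, using hashlib directly instead of this repo's gen_file_metadata() helper:

    import hashlib

    class WaybackError(Exception):
        # stand-in for sandcrawler.ia.WaybackError
        pass

    def confirm_replay_hash(body, cdx_sha1hex):
        # compare SHA-1 of the fetched bytes against the CDX record's digest
        if hashlib.sha1(body).hexdigest() != cdx_sha1hex:
            raise WaybackError(
                "replay fetch resulted in non-matching SHA1 (vs. CDX API)")
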