diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-09 15:37:42 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-09 16:33:21 -0800 |
commit | 38e635105a658850399847aa23a5bd5325b0d616 (patch) | |
tree | 3a887091df76b6128bb120e24942b27cf14e837d /python/sandcrawler/ia.py | |
parent | 101e3c818fa5852c11003e616526726e189ab2c3 (diff) | |
download | sandcrawler-38e635105a658850399847aa23a5bd5325b0d616.tar.gz sandcrawler-38e635105a658850399847aa23a5bd5325b0d616.zip |
lots of progress on wayback refactoring
- too much to list
- canonical flags to control crawling
- cdx_to_dict helper
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- | python/sandcrawler/ia.py | 162 |
1 file changed, 123 insertions, 39 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 6544e58..6468743 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -67,6 +67,21 @@ def cdx_partial_from_row(full): sha1hex=full.sha1hex, ) +def cdx_to_dict(cdx): + d = { + "surt": cdx.surt, + "datetime": cdx.datetime, + "url": cdx.url, + "mimetype": cdx.mimetype, + "status_code": cdx.status_code, + "sha1b32": cdx.sha1b32, + "sha1hex": cdx.sha1hex, + } + if type(cdx) == CdxRow and '/' in cdx.warc_path: + d['warc_csize'] = cdx.warc_csize + d['warc_offset'] = cdx.warc_offset + d['warc_path'] = cdx.warc_path + return d class CdxApiError(Exception): pass @@ -98,6 +113,7 @@ class CdxApiClient: rows = [] for raw in rj[1:]: assert len(raw) == 11 # JSON is short + #print(raw, file=sys.stderr) row = CdxRow( surt=raw[0], datetime=raw[1], @@ -136,7 +152,9 @@ class CdxApiClient: raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime)) row = resp[0] if not (row.url == url and row.datetime == datetime): - raise KeyError("CDX url/datetime not found: {} {} (closest: {})".format(url, datetime, row)) + raise KeyError("Didn't get exact CDX url/datetime match. 
url:{} dt:{} got:{}".format(url, datetime, row)) + if filter_status_code: + assert row.status_code == filter_status_code return row def lookup_best(self, url, max_age_days=None, best_mimetype=None): @@ -209,13 +227,16 @@ class WaybackClient: # this *does* want to be http://, not https:// self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) + self.petabox_webdata_secret = kwargs.get( + 'petabox_webdata_secret', + os.environ.get('PETABOX_WEBDATA_SECRET'), + ) self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/') self.rstore = None self.max_redirects = 25 self.wayback_endpoint = "https://web.archive.org/web/" - def fetch_petabox(self, c_size, offset, warc_path): + def fetch_petabox(self, csize, offset, warc_path): """ Fetches wayback resource directly from petabox using WARC path/offset/csize. 
@@ -243,7 +264,8 @@ class WaybackClient: webdata_secret=self.petabox_webdata_secret, download_base_url=self.petabox_base_url)) try: - gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) + #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr) + gwb_record = self.rstore.load_resource(warc_uri, offset, csize) except wayback.exception.ResourceUnavailable: raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)") except ValueError as ve: @@ -264,14 +286,15 @@ class WaybackClient: try: body = gwb_record.open_raw_content().read() except IncompleteRead as ire: - raise WaybackError("failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + raise WaybackError( + "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) return WarcResource( status_code=status_code, location=location, body=body, ) - def fetch_petabox_body(self, c_size, offset, warc_path): + def fetch_petabox_body(self, csize, offset, warc_path): """ Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize. @@ -279,14 +302,20 @@ class WaybackClient: Thin helper around fetch_petabox() """ - resource = self.fetch_petabox(c_size, offset, warc_path) + resource = self.fetch_petabox( + csize=csize, + offset=offset, + warc_path=warc_path, + ) if resource.status_code != 200: - raise KeyError("archived HTTP response (WARC) was not 200: {}".format(gwb_record.get_status()[0])) + raise KeyError("archived HTTP response (WARC) was not 200: {}".format( + gwb_record.get_status()[0]), + ) return resource.body - def fetch_replay_body(self, url, datetime): + def fetch_replay_body(self, url, datetime, cdx_sha1hex=None): """ Fetches an HTTP 200 record from wayback via the replay interface (web.archive.org) instead of petabox. @@ -294,9 +323,20 @@ class WaybackClient: Intended for use with SPN2 requests, where request body has not ended up in petabox yet. 
- TODO: is this really necessary? + If cdx_sha1hex is passed, will try to verify fetched body. Note that + this check *won't work* in many cases, due to CDX hash being of + compressed transfer data, not the uncompressed final content bytes. + + TODO: could instead try to verify that we got the expected replay body + using... new X-Archive headers? """ + + # defensively check datetime format + assert len(datetime) == 14 + assert datetime.isdigit() + try: + # TODO: don't follow redirects? resp = requests.get(self.wayback_endpoint + datetime + "id_/" + url) except requests.exceptions.TooManyRedirects: raise WaybackError("redirect loop (wayback replay fetch)") @@ -304,26 +344,46 @@ class WaybackClient: resp.raise_for_status() except Exception as e: raise WaybackError(str(e)) - # TODO: some verification here? #print(resp.url, file=sys.stderr) - #print(resp.content) + + # defensively check that this is actually correct replay based on headers + assert "X-Archive-Src" in resp.headers + assert datetime in resp.url + + if cdx_sha1hex: + # verify that body matches CDX hash + # TODO: don't need *all* these hashes, just sha1 + file_meta = gen_file_metadata(resp.content) + if cdx_sha1hex != file_meta['sha1hex']: + print("REPLAY MISMATCH: cdx:{} replay:{}".format( + cdx_sha1hex, + file_meta['sha1hex']), + file=sys.stderr) + raise WaybackError("replay fetch body didn't match CDX hash cdx:{} body:{}".format( + cdx_sha1hex, + file_meta['sha1hex']), + ) return resp.content def lookup_resource(self, start_url, best_mimetype=None): """ Looks in wayback for a resource starting at the URL, following any - redirects. Returns a ResourceResult object. + redirects. Returns a ResourceResult object, which may indicate a + failure to fetch the resource. + + Only raises exceptions on remote service failure or unexpected + problems. In a for loop: - lookup best CDX - redirect? + lookup "best" CDX + redirect status code? fetch wayback continue - good? + success (200)? 
fetch wayback return success - bad? + bad (other status)? return failure got to end? @@ -333,6 +393,7 @@ class WaybackClient: urls_seen = [start_url] for i in range(self.max_redirects): cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype) + #print(cdx_row, file=sys.stderr) if not cdx_row: return ResourceResult( start_url=start_url, @@ -345,7 +406,18 @@ class WaybackClient: cdx=None, ) if cdx_row.status_code == 200: - body = self.fetch_petabox_body(cdx_row.warc_csize, cdx_row.warc_offset, cdx_row.warc_path) + if '/' in cdx_row.warc_path: + body = self.fetch_petabox_body( + csize=cdx_row.warc_csize, + offset=cdx_row.warc_offset, + warc_path=cdx_row.warc_path, + ) + else: + body = self.fetch_replay_body( + url=cdx_row.url, + datetime=cdx_row.datetime, + ) + cdx_row = cdx_partial_from_row(cdx_row) return ResourceResult( start_url=start_url, hit=True, @@ -357,8 +429,14 @@ class WaybackClient: cdx=cdx_row, ) elif 300 <= cdx_row.status_code < 400: - resource = self.fetch_petabox(cdx_row.warc_csize, cdx_row.warc_offset, cdx_row.warc_path) + resource = self.fetch_petabox( + csize=cdx_row.warc_csize, + offset=cdx_row.warc_offset, + warc_path=cdx_row.warc_path, + ) assert 300 <= resource.status_code < 400 + assert resource.location + #print(resource, file=sys.stderr) next_url = resource.location if next_url in urls_seen: return ResourceResult( @@ -443,8 +521,8 @@ class SavePageNowClient: - terminal_dt: wayback timestamp of final capture - resources: list of all URLs captured - TODO: parse SPN error codes and handle better. Eg, non-200 remote - statuses, invalid hosts/URLs, timeouts, backoff, etc. + TODO: parse SPN error codes (status string) and handle better. Eg, + non-200 remote statuses, invalid hosts/URLs, timeouts, backoff, etc. 
""" if not (self.ia_access_key and self.ia_secret_key): raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)") @@ -486,6 +564,8 @@ class SavePageNowClient: if not final_json: raise SavePageNowError("SPN2 timed out (polling count exceeded)") + # if there was a recent crawl of same URL, fetch the status of that + # crawl to get correct datetime if final_json.get('original_job_id'): resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, final_json['original_job_id'])) try: @@ -538,26 +618,30 @@ class SavePageNowClient: cdx=None, ) - # sleep one second to let CDX API update (XXX) - time.sleep(1.0) - - # fetch CDX, then body + # fetch exact CDX row cdx_row = wayback_client.cdx_client.fetch( - spn_result.terminal_url, - spn_result.terminal_dt, + url=spn_result.terminal_url, + datetime=spn_result.terminal_dt, filter_status_code=200, ) - print(spn_result, file=sys.stderr) - print(cdx_row, file=sys.stderr) - assert cdx_row.status_code == 200 - body = wayback_client.fetch_replay_body(cdx_row.url, cdx_row.datetime) - file_meta = gen_file_metadata(body) - if cdx_row.sha1hex != file_meta['sha1hex']: - print(file_meta, file=sys.stderr) - raise WaybackError("replay fetch resulted in non-matching SHA1 (vs. 
CDX API)") - - # not a full CDX yet - cdx_partial = cdx_partial_from_row(cdx_row) + + if '/' in cdx_row.warc_path: + # Usually can't do this kind of direct fetch because CDX result is recent/live + resource = wayback_client.fetch_petabox( + csize=cdx_row.warc_csize, + offset=cdx_row.warc_offset, + warc_path=cdx_row.warc_path, + ) + body = resource.body + else: + # note: currently not trying to verify cdx_row.sha1hex + body = wayback_client.fetch_replay_body( + url=cdx_row.url, + datetime=cdx_row.datetime, + ) + # warc_path etc will change, so strip them out + cdx_row = cdx_partial_from_row(cdx_row) + return ResourceResult( start_url=start_url, hit=True, @@ -566,6 +650,6 @@ class SavePageNowClient: terminal_dt=cdx_row.datetime, terminal_status_code=cdx_row.status_code, body=body, - cdx=cdx_partial, + cdx=cdx_row, ) |