diff options
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- | python/sandcrawler/ia.py | 611 |
1 files changed, 354 insertions, 257 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 8f28d42..99a7f36 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -34,50 +34,63 @@ class SandcrawlerBackoffError(Exception): be passed up through any timeout/retry code and become an actual long pause or crash. """ + pass -ResourceResult = namedtuple("ResourceResult", [ - "start_url", - "hit", - "status", - "terminal_url", - "terminal_dt", - "terminal_status_code", - "body", - "cdx", - "revisit_cdx", -]) - -WarcResource = namedtuple("WarcResource", [ - "status_code", - "location", - "body", - "revisit_cdx", -]) - -CdxRow = namedtuple('CdxRow', [ - 'surt', - 'datetime', - 'url', - 'mimetype', - 'status_code', - 'sha1b32', - 'sha1hex', - 'warc_csize', - 'warc_offset', - 'warc_path', -]) - -CdxPartial = namedtuple('CdxPartial', [ - 'surt', - 'datetime', - 'url', - 'mimetype', - 'status_code', - 'sha1b32', - 'sha1hex', -]) +ResourceResult = namedtuple( + "ResourceResult", + [ + "start_url", + "hit", + "status", + "terminal_url", + "terminal_dt", + "terminal_status_code", + "body", + "cdx", + "revisit_cdx", + ], +) + +WarcResource = namedtuple( + "WarcResource", + [ + "status_code", + "location", + "body", + "revisit_cdx", + ], +) + +CdxRow = namedtuple( + "CdxRow", + [ + "surt", + "datetime", + "url", + "mimetype", + "status_code", + "sha1b32", + "sha1hex", + "warc_csize", + "warc_offset", + "warc_path", + ], +) + +CdxPartial = namedtuple( + "CdxPartial", + [ + "surt", + "datetime", + "url", + "mimetype", + "status_code", + "sha1b32", + "sha1hex", + ], +) def cdx_partial_from_row(row: Union[CdxRow, CdxPartial]) -> CdxPartial: @@ -102,10 +115,10 @@ def cdx_to_dict(cdx: Union[CdxRow, CdxPartial]) -> Dict[str, Any]: "sha1b32": cdx.sha1b32, "sha1hex": cdx.sha1hex, } - if type(cdx) == CdxRow and '/' in cdx.warc_path: - d['warc_csize'] = cdx.warc_csize - d['warc_offset'] = cdx.warc_offset - d['warc_path'] = cdx.warc_path + if type(cdx) == CdxRow and "/" in cdx.warc_path: + d["warc_csize"] = cdx.warc_csize + d["warc_offset"] = cdx.warc_offset + d["warc_path"] = cdx.warc_path return d @@ -116,9 +129,9 @@ def fuzzy_match_url(left: str, right: str) -> bool: """ if left == right: return True - if '://' in left and '://' in right: - left = '://'.join(left.split('://')[1:]) - right = '://'.join(right.split('://')[1:]) + if "://" in left and "://" in right: + left = "://".join(left.split("://")[1:]) + right = "://".join(right.split("://")[1:]) if left == right: return True if left == right + "/" or right == left + "/": @@ -149,14 +162,17 @@ class CdxApiClient: def __init__(self, host_url: str = "https://web.archive.org/cdx/search/cdx", **kwargs): self.host_url = host_url self.http_session = requests_retry_session(retries=3, backoff_factor=3) - cdx_auth_token = kwargs.get('cdx_auth_token', os.environ.get('CDX_AUTH_TOKEN')) + cdx_auth_token = kwargs.get("cdx_auth_token", os.environ.get("CDX_AUTH_TOKEN")) if not cdx_auth_token: raise Exception( - "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)") - self.http_session.headers.update({ - 'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient', - 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token), - }) + "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)" + ) + self.http_session.headers.update( + { + "User-Agent": "Mozilla/5.0 sandcrawler.CdxApiClient", + "Cookie": "cdx_auth_token={}".format(cdx_auth_token), + } + ) def _query_api(self, params: Dict[str, str]) -> Optional[List[CdxRow]]: """ @@ -165,7 +181,7 @@ class CdxApiClient: resp = self.http_session.get(self.host_url, params=params) if resp.status_code != 200: raise CdxApiError(resp.text) - #print(resp.url, file=sys.stderr) + # print(resp.url, file=sys.stderr) if not resp.text: return None rj = resp.json() @@ -187,7 +203,7 @@ class CdxApiClient: status_code = int(raw[4]) # CDX rows with no WARC records? - if raw[8] == '-' or raw[9] == '-' or raw[10] == '-': + if raw[8] == "-" or raw[9] == "-" or raw[10] == "-": continue row = CdxRow( @@ -206,28 +222,31 @@ class CdxApiClient: rows.append(row) return rows - def fetch(self, - url: str, - datetime: str, - filter_status_code: Optional[int] = None, - retry_sleep: Optional[int] = None) -> CdxRow: + def fetch( + self, + url: str, + datetime: str, + filter_status_code: Optional[int] = None, + retry_sleep: Optional[int] = None, + ) -> CdxRow: """ Fetches a single CDX row by url/datetime. Raises a KeyError if not found, because we expect to be looking up a specific full record. """ if len(datetime) != 14: raise ValueError( - "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)) + "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime) + ) params: Dict[str, str] = { - 'url': url, - 'from': datetime, - 'to': datetime, - 'matchType': 'exact', - 'limit': "1", - 'output': 'json', + "url": url, + "from": datetime, + "to": datetime, + "matchType": "exact", + "limit": "1", + "output": "json", } if filter_status_code: - params['filter'] = "statuscode:{}".format(filter_status_code) + params["filter"] = "statuscode:{}".format(filter_status_code) resp = self._query_api(params) if not resp: if retry_sleep and retry_sleep > 0: @@ -235,37 +254,43 @@ class CdxApiClient: if retry_sleep > 3: next_sleep = retry_sleep - 3 retry_sleep = 3 - print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), - file=sys.stderr) + print( + " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), + file=sys.stderr, + ) time.sleep(retry_sleep) - return self.fetch(url, - datetime, - filter_status_code=filter_status_code, - retry_sleep=next_sleep) + return self.fetch( + url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep + ) raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime)) row = resp[0] # allow fuzzy http/https match if not (fuzzy_match_url(row.url, url) and row.datetime == datetime): if retry_sleep and retry_sleep > 0: - print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), - file=sys.stderr) + print( + " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), + file=sys.stderr, + ) time.sleep(retry_sleep) - return self.fetch(url, - datetime, - filter_status_code=filter_status_code, - retry_sleep=None) + return self.fetch( + url, datetime, filter_status_code=filter_status_code, retry_sleep=None + ) raise KeyError( "Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format( - url, datetime, row)) + url, datetime, row + ) + ) if filter_status_code: assert row.status_code == filter_status_code return row - def lookup_best(self, - url: str, - max_age_days: Optional[int] = None, - best_mimetype: Optional[str] = None, - closest: Union[datetime.datetime, str, None] = None) -> Optional[CdxRow]: + def lookup_best( + self, + url: str, + max_age_days: Optional[int] = None, + best_mimetype: Optional[str] = None, + closest: Union[datetime.datetime, str, None] = None, + ) -> Optional[CdxRow]: """ Fetches multiple CDX rows for the given URL, tries to find the most recent. @@ -289,27 +314,26 @@ class CdxApiClient: """ params: Dict[str, str] = { - 'url': url, - 'matchType': 'exact', - 'limit': "-25", - 'output': 'json', + "url": url, + "matchType": "exact", + "limit": "-25", + "output": "json", # Collapsing seems efficient, but is complex; would need to include # other filters and status code in filter #'collapse': 'timestamp:6', - # Revisits now allowed and resolved! #'filter': '!mimetype:warc/revisit', } if max_age_days: since = datetime.date.today() - datetime.timedelta(days=max_age_days) - params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day) + params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day) if closest: if isinstance(closest, datetime.datetime): - params['closest'] = '%04d%02d%02d' % (closest.year, closest.month, closest.day) + params["closest"] = "%04d%02d%02d" % (closest.year, closest.month, closest.day) else: - params['closest'] = closest - params['sort'] = "closest" - #print(params, file=sys.stderr) + params["closest"] = closest + params["sort"] = "closest" + # print(params, file=sys.stderr) rows = self._query_api(params) if not rows: return None @@ -326,7 +350,7 @@ class CdxApiClient: int(r.mimetype == best_mimetype), int(r.mimetype != "warc/revisit"), int(r.datetime[:6]), - int('/' in r.warc_path), + int("/" in r.warc_path), int(r.datetime), ) @@ -358,25 +382,23 @@ class WaybackClient: self.cdx_client = CdxApiClient() # /serve/ instead of /download/ doesn't record view count # this *does* want to be http://, not https:// - self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') + self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/") # gwb library will fall back to reading from /opt/.petabox/webdata.secret self.petabox_webdata_secret = kwargs.get( - 'petabox_webdata_secret', - os.environ.get('PETABOX_WEBDATA_SECRET'), + "petabox_webdata_secret", + os.environ.get("PETABOX_WEBDATA_SECRET"), ) - self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/') + self.warc_uri_prefix = kwargs.get("warc_uri_prefix", "https://archive.org/serve/") self.rstore = None self.max_redirects = 25 self.wayback_endpoint = "https://web.archive.org/web/" self.replay_headers = { - 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient', + "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient", } - def fetch_petabox(self, - csize: int, - offset: int, - warc_path: str, - resolve_revisit: bool = True) -> WarcResource: + def fetch_petabox( + self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True + ) -> WarcResource: """ Fetches wayback resource directly from petabox using WARC path/offset/csize. @@ -401,37 +423,49 @@ class WaybackClient: raise Exception("WaybackClient needs petabox secret to do direct WARC fetches") if "/" not in warc_path: raise ValueError( - "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path)) + "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path) + ) warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: self.rstore = ResourceStore( - loaderfactory=CDXLoaderFactory3(webdata_secret=self.petabox_webdata_secret, )) + loaderfactory=CDXLoaderFactory3( + webdata_secret=self.petabox_webdata_secret, + ) + ) assert self.rstore try: - #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr) + # print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr) gwb_record = self.rstore.load_resource(warc_uri, offset, csize) except wayback.exception.ResourceUnavailable: print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) raise PetaboxError( - "failed to load file contents from wayback/petabox (ResourceUnavailable)") + "failed to load file contents from wayback/petabox (ResourceUnavailable)" + ) except wayback.exception.InvalidResource: print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) raise WaybackContentError( - "failed to load file contents from wayback/petabox (InvalidResource)") + "failed to load file contents from wayback/petabox (InvalidResource)" + ) except urllib3.exceptions.ReadTimeoutError as rte: raise PetaboxError( - "failed to load file contents from wayback/petabox (ReadTimeoutError: {})". - format(rte)) + "failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format( + rte + ) + ) except ValueError as ve: raise PetaboxError( - "failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + "failed to load file contents from wayback/petabox (ValueError: {})".format(ve) + ) except EOFError as eofe: raise PetaboxError( - "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe) + ) except TypeError as te: raise PetaboxError( - "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" - .format(te)) + "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format( + te + ) + ) except Exception as e: if "while decompressing data: invalid block type" in str(e): raise PetaboxError( @@ -449,8 +483,11 @@ class WaybackClient: raise WaybackContentError("too many HTTP headers (in wayback fetch)") location = gwb_record.get_location() or None - if status_code is None and gwb_record.target_uri.startswith( - b"ftp://") and not gwb_record.is_revisit(): + if ( + status_code is None + and gwb_record.target_uri.startswith(b"ftp://") + and not gwb_record.is_revisit() + ): # TODO: some additional verification here? status_code = 226 @@ -463,17 +500,19 @@ class WaybackClient: if not (revisit_uri and revisit_dt): raise WaybackContentError( "revisit record missing URI and/or DT: warc:{} offset:{}".format( - warc_path, offset)) + warc_path, offset + ) + ) # convert revisit_dt # len("2018-07-24T11:56:49"), or with "Z" assert len(revisit_dt) in (19, 20) if type(revisit_uri) is bytes: - revisit_uri = revisit_uri.decode('utf-8') + revisit_uri = revisit_uri.decode("utf-8") if type(revisit_dt) is bytes: - revisit_dt = revisit_dt.decode('utf-8') - revisit_dt = revisit_dt.replace('-', '').replace(':', - '').replace('T', - '').replace('Z', '') + revisit_dt = revisit_dt.decode("utf-8") + revisit_dt = ( + revisit_dt.replace("-", "").replace(":", "").replace("T", "").replace("Z", "") + ) assert len(revisit_dt) == 14 try: revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt) @@ -491,8 +530,10 @@ class WaybackClient: body = gwb_record.open_raw_content().read() except IncompleteRead as ire: raise WaybackError( - "failed to read actual file contents from wayback/petabox (IncompleteRead: {})" - .format(ire)) + "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( + ire + ) + ) elif status_code is None: raise WaybackContentError("got a None status_code in (W)ARC record") return WarcResource( @@ -502,12 +543,14 @@ class WaybackClient: revisit_cdx=revisit_cdx, ) - def fetch_petabox_body(self, - csize: int, - offset: int, - warc_path: str, - resolve_revisit: bool = True, - expected_status_code: Optional[int] = None) -> bytes: + def fetch_petabox_body( + self, + csize: int, + offset: int, + warc_path: str, + resolve_revisit: bool = True, + expected_status_code: Optional[int] = None, + ) -> bytes: """ Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize. @@ -524,20 +567,22 @@ class WaybackClient: if expected_status_code: if expected_status_code != resource.status_code: - raise KeyError("archived HTTP response (WARC) was not {}: {}".format( - expected_status_code, - resource.status_code, - )) + raise KeyError( + "archived HTTP response (WARC) was not {}: {}".format( + expected_status_code, + resource.status_code, + ) + ) elif resource.status_code not in (200, 226): - raise KeyError("archived HTTP response (WARC) was not 200: {}".format( - resource.status_code)) + raise KeyError( + "archived HTTP response (WARC) was not 200: {}".format(resource.status_code) + ) return resource.body - def fetch_replay_body(self, - url: str, - datetime: str, - cdx_sha1hex: Optional[str] = None) -> bytes: + def fetch_replay_body( + self, url: str, datetime: str, cdx_sha1hex: Optional[str] = None + ) -> bytes: """ Fetches an HTTP 200 record from wayback via the replay interface (web.archive.org) instead of petabox. @@ -570,32 +615,42 @@ class WaybackClient: except UnicodeDecodeError: raise WaybackContentError( "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format( - url)) + url + ) + ) try: resp.raise_for_status() except Exception as e: raise WaybackError(str(e)) - #print(resp.url, file=sys.stderr) + # print(resp.url, file=sys.stderr) # defensively check that this is actually correct replay based on headers if "X-Archive-Src" not in resp.headers: raise WaybackError("replay fetch didn't return X-Archive-Src in headers") if datetime not in resp.url: - raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format( - datetime, resp.url)) + raise WaybackError( + "didn't get exact reply (redirect?) datetime:{} got:{}".format( + datetime, resp.url + ) + ) if cdx_sha1hex: # verify that body matches CDX hash # TODO: don't need *all* these hashes, just sha1 file_meta = gen_file_metadata(resp.content) - if cdx_sha1hex != file_meta['sha1hex']: - print(" REPLAY MISMATCH: cdx:{} replay:{}".format(cdx_sha1hex, - file_meta['sha1hex']), - file=sys.stderr) + if cdx_sha1hex != file_meta["sha1hex"]: + print( + " REPLAY MISMATCH: cdx:{} replay:{}".format( + cdx_sha1hex, file_meta["sha1hex"] + ), + file=sys.stderr, + ) raise WaybackContentError( "replay fetch body didn't match CDX hash cdx:{} body:{}".format( - cdx_sha1hex, file_meta['sha1hex']), ) + cdx_sha1hex, file_meta["sha1hex"] + ), + ) return resp.content def fetch_replay_redirect(self, url: str, datetime: str) -> Optional[str]: @@ -625,37 +680,44 @@ class WaybackClient: except UnicodeDecodeError: raise WaybackContentError( "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format( - url)) + url + ) + ) try: resp.raise_for_status() except Exception as e: raise WaybackError(str(e)) - #print(resp.url, file=sys.stderr) + # print(resp.url, file=sys.stderr) # defensively check that this is actually correct replay based on headers # previously check for "X-Archive-Redirect-Reason" here if "X-Archive-Src" not in resp.headers: raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers") if datetime not in resp.url: - raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format( - datetime, resp.url)) + raise WaybackError( + "didn't get exact reply (redirect?) datetime:{} got:{}".format( + datetime, resp.url + ) + ) redirect_url = resp.headers.get("Location") # eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw - #print(redirect_url, file=sys.stderr) + # print(redirect_url, file=sys.stderr) if redirect_url and redirect_url.startswith("https://web.archive.org/web/"): redirect_url = "/".join(redirect_url.split("/")[5:]) - #print(redirect_url, file=sys.stderr) + # print(redirect_url, file=sys.stderr) if redirect_url and redirect_url.startswith("http"): redirect_url = clean_url(redirect_url) return redirect_url else: return None - def lookup_resource(self, - start_url: str, - best_mimetype: Optional[str] = None, - closest: Union[str, datetime.datetime, None] = None) -> ResourceResult: + def lookup_resource( + self, + start_url: str, + best_mimetype: Optional[str] = None, + closest: Union[str, datetime.datetime, None] = None, + ) -> ResourceResult: """ Looks in wayback for a resource starting at the URL, following any redirects. Returns a ResourceResult object, which may indicate a @@ -684,8 +746,9 @@ class WaybackClient: for i in range(self.max_redirects + 1): print(" URL: {}".format(next_url), file=sys.stderr) next_row: Optional[CdxRow] = self.cdx_client.lookup_best( - next_url, best_mimetype=best_mimetype, closest=closest) - #print(next_row, file=sys.stderr) + next_url, best_mimetype=best_mimetype, closest=closest + ) + # print(next_row, file=sys.stderr) if not next_row: return ResourceResult( start_url=start_url, @@ -702,7 +765,7 @@ class WaybackClient: cdx_row: CdxRow = next_row # first try straight-forward redirect situation - if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path: + if cdx_row.mimetype == "warc/revisit" and "/" in cdx_row.warc_path: resource = self.fetch_petabox( csize=cdx_row.warc_csize, offset=cdx_row.warc_offset, @@ -725,7 +788,7 @@ class WaybackClient: if cdx_row.status_code in (200, 226): revisit_cdx = None final_cdx: Union[CdxRow, CdxPartial] = cdx_row - if '/' in cdx_row.warc_path: + if "/" in cdx_row.warc_path: resource = self.fetch_petabox( csize=cdx_row.warc_csize, offset=cdx_row.warc_offset, @@ -751,7 +814,7 @@ class WaybackClient: revisit_cdx=revisit_cdx, ) elif 300 <= (cdx_row.status_code or 0) < 400: - if '/' in cdx_row.warc_path: + if "/" in cdx_row.warc_path: resource = self.fetch_petabox( csize=cdx_row.warc_csize, offset=cdx_row.warc_offset, @@ -848,34 +911,39 @@ class SavePageNowBackoffError(SandcrawlerBackoffError): pass -SavePageNowResult = namedtuple('SavePageNowResult', [ - 'success', - 'status', - 'job_id', - 'request_url', - 'terminal_url', - 'terminal_dt', - 'resources', -]) +SavePageNowResult = namedtuple( + "SavePageNowResult", + [ + "success", + "status", + "job_id", + "request_url", + "terminal_url", + "terminal_dt", + "resources", + ], +) class SavePageNowClient: def __init__(self, v2endpoint: str = "https://web.archive.org/save", **kwargs): - self.ia_access_key = kwargs.get('ia_access_key', os.environ.get('IA_ACCESS_KEY')) - self.ia_secret_key = kwargs.get('ia_secret_key', os.environ.get('IA_SECRET_KEY')) + self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY")) + self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY")) self.v2endpoint = v2endpoint self.v2_session = requests_retry_session(retries=5, backoff_factor=3) - self.v2_session.headers.update({ - 'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient', - 'Accept': 'application/json', - 'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key), - }) + self.v2_session.headers.update( + { + "User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient", + "Accept": "application/json", + "Authorization": "LOW {}:{}".format(self.ia_access_key, self.ia_secret_key), + } + ) # 3 minutes total self.poll_count = 60 self.poll_seconds = 3.0 - self.spn_cdx_retry_sec = kwargs.get('spn_cdx_retry_sec', 9.0) + self.spn_cdx_retry_sec = kwargs.get("spn_cdx_retry_sec", 9.0) # these are special-case web domains for which we want SPN2 to not run # a headless browser (brozzler), but instead simply run wget. @@ -888,20 +956,20 @@ class SavePageNowClient: "://europepmc.org/backend/ptpmcrender.fcgi", "://pdfs.semanticscholar.org/", "://res.mdpi.com/", - # platform sites "://zenodo.org/", "://figshare.org/", "://springernature.figshare.com/", - # popular simple cloud storage or direct links "://s3-eu-west-1.amazonaws.com/", ] - def save_url_now_v2(self, - request_url: str, - force_simple_get: Optional[int] = None, - capture_outlinks: int = 0) -> SavePageNowResult: + def save_url_now_v2( + self, + request_url: str, + force_simple_get: Optional[int] = None, + capture_outlinks: int = 0, + ) -> SavePageNowResult: """ Returns a "SavePageNowResult" (namedtuple) if SPN request was processed at all, or raises an exception if there was an error with SPN itself. @@ -944,33 +1012,39 @@ class SavePageNowClient: resp = self.v2_session.post( self.v2endpoint, data={ - 'url': request_url, - 'capture_all': 1, - 'capture_outlinks': capture_outlinks, - 'capture_screenshot': 0, - 'if_not_archived_within': '1d', - 'force_get': force_simple_get, - 'skip_first_archive': 1, - 'outlinks_availability': 0, - 'js_behavior_timeout': 0, + "url": request_url, + "capture_all": 1, + "capture_outlinks": capture_outlinks, + "capture_screenshot": 0, + "if_not_archived_within": "1d", + "force_get": force_simple_get, + "skip_first_archive": 1, + "outlinks_availability": 0, + "js_behavior_timeout": 0, }, ) if resp.status_code == 429: - raise SavePageNowBackoffError("status_code: {}, url: {}".format( - resp.status_code, request_url)) + raise SavePageNowBackoffError( + "status_code: {}, url: {}".format(resp.status_code, request_url) + ) elif resp.status_code != 200: - raise SavePageNowError("SPN2 status_code: {}, url: {}".format( - resp.status_code, request_url)) + raise SavePageNowError( + "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url) + ) resp_json = resp.json() - if resp_json and 'message' in resp_json and 'You have already reached the limit of active sessions' in resp_json[ - 'message']: - raise SavePageNowBackoffError(resp_json['message']) - elif not resp_json or 'job_id' not in resp_json or not resp_json['job_id']: + if ( + resp_json + and "message" in resp_json + and "You have already reached the limit of active sessions" in resp_json["message"] + ): + raise SavePageNowBackoffError(resp_json["message"]) + elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]: raise SavePageNowError( - "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)) + "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json) + ) - job_id = resp_json['job_id'] + job_id = resp_json["job_id"] print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr) # poll until complete @@ -981,53 +1055,59 @@ class SavePageNowClient: resp.raise_for_status() except Exception: raise SavePageNowError(resp.content) - status = resp.json()['status'] - if status == 'pending': + status = resp.json()["status"] + if status == "pending": time.sleep(self.poll_seconds) - elif status in ('success', 'error'): + elif status in ("success", "error"): final_json = resp.json() break else: - raise SavePageNowError("Unknown SPN2 status:{} url:{}".format( - status, request_url)) + raise SavePageNowError( + "Unknown SPN2 status:{} url:{}".format(status, request_url) + ) if not final_json: raise SavePageNowError("SPN2 timed out (polling count exceeded)") # if there was a recent crawl of same URL, fetch the status of that # crawl to get correct datetime - if final_json.get('original_job_id'): - print(f" SPN recent capture: {job_id} -> {final_json['original_job_id']}", - file=sys.stderr) - resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, - final_json['original_job_id'])) + if final_json.get("original_job_id"): + print( + f" SPN recent capture: {job_id} -> {final_json['original_job_id']}", + file=sys.stderr, + ) + resp = self.v2_session.get( + "{}/status/{}".format(self.v2endpoint, final_json["original_job_id"]) + ) try: resp.raise_for_status() except Exception: raise SavePageNowError(resp.content) final_json = resp.json() - #print(final_json, file=sys.stderr) + # print(final_json, file=sys.stderr) - if final_json['status'] == "success": - if final_json.get('original_url').startswith('/'): - print(f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}", - file=sys.stderr) + if final_json["status"] == "success": + if final_json.get("original_url").startswith("/"): + print( + f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}", + file=sys.stderr, + ) return SavePageNowResult( True, "success", job_id, request_url, - final_json['original_url'], - final_json['timestamp'], - final_json['resources'], + final_json["original_url"], + final_json["timestamp"], + final_json["resources"], ) else: - if final_json['status'] == 'pending': - final_json['status'] = 'error:pending' + if final_json["status"] == "pending": + final_json["status"] = "error:pending" return SavePageNowResult( False, - final_json.get('status_ext') or final_json['status'], + final_json.get("status_ext") or final_json["status"], job_id, request_url, None, @@ -1035,10 +1115,12 @@ class SavePageNowClient: None, ) - def crawl_resource(self, - start_url: str, - wayback_client: WaybackClient, - force_simple_get: Optional[int] = None) -> ResourceResult: + def crawl_resource( + self, + start_url: str, + wayback_client: WaybackClient, + force_simple_get: Optional[int] = None, + ) -> ResourceResult: """ Runs a SPN2 crawl, then fetches body. @@ -1048,18 +1130,23 @@ class SavePageNowClient: """ # HACK: capture CNKI domains with outlinks (for COVID-19 crawling) - if 'gzbd.cnki.net/' in start_url: - spn_result = self.save_url_now_v2(start_url, - force_simple_get=force_simple_get, - capture_outlinks=1) + if "gzbd.cnki.net/" in start_url: + spn_result = self.save_url_now_v2( + start_url, force_simple_get=force_simple_get, capture_outlinks=1 + ) else: spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get) if not spn_result.success: status = spn_result.status - if status in ("error:invalid-url", "error:not-found", - "error:invalid-host-resolution", "error:gateway-timeout", - "error:too-many-redirects", "error:read-timeout"): + if status in ( + "error:invalid-url", + "error:not-found", + "error:invalid-host-resolution", + "error:gateway-timeout", + "error:too-many-redirects", + "error:read-timeout", + ): status = status.replace("error:", "") elif status in ("error:no-access", "error:forbidden"): status = "forbidden" @@ -1070,8 +1157,10 @@ class SavePageNowClient: elif status.startswith("error:"): status = "spn2-" + status # despite other errors, call these a failure (so we don't retry) - if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') - or spn_result.terminal_url.endswith("cookieSet=1")): + if spn_result.terminal_url and ( + spn_result.terminal_url.endswith("/cookieAbsent") + or spn_result.terminal_url.endswith("cookieSet=1") + ): status = "blocked-cookie" return ResourceResult( start_url=start_url, @@ -1084,10 +1173,10 @@ class SavePageNowClient: cdx=None, revisit_cdx=None, ) - #print(spn_result, file=sys.stderr) + # print(spn_result, file=sys.stderr) # detect partial URL response (aka, success, but missing full URL) - if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith('/'): + if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith("/"): return ResourceResult( start_url=start_url, hit=False, @@ -1102,7 +1191,8 @@ class SavePageNowClient: # don't try to CDX fetch for this common cookie block terminal if spn_result.terminal_url.endswith( - '/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"): + "/cookieAbsent" + ) or spn_result.terminal_url.endswith("cookieSet=1"): return ResourceResult( start_url=start_url, hit=False, @@ -1127,7 +1217,7 @@ class SavePageNowClient: cdx_row = elsevier_pdf_cdx else: print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr) - #print(elsevier_pdf_cdx, file=sys.stderr) + # print(elsevier_pdf_cdx, file=sys.stderr) if not cdx_row: # lookup exact @@ -1164,11 +1254,11 @@ class SavePageNowClient: revisit_cdx=None, ) - #print(cdx_row, file=sys.stderr) + # print(cdx_row, file=sys.stderr) revisit_cdx = None final_cdx: Union[CdxRow, CdxPartial] = cdx_row - if '/' in cdx_row.warc_path: + if "/" in cdx_row.warc_path: # Usually can't do this kind of direct fetch because CDX result is recent/live resource = wayback_client.fetch_petabox( csize=cdx_row.warc_csize, @@ -1228,12 +1318,19 @@ class SavePageNowClient: ) -def fix_transfer_encoding(file_meta: dict, - resource: ResourceResult) -> Tuple[dict, ResourceResult]: - if resource.body and file_meta[ - 'mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip': - print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), - file=sys.stderr) +def fix_transfer_encoding( + file_meta: dict, resource: ResourceResult +) -> Tuple[dict, ResourceResult]: + if ( + resource.body + and file_meta["mimetype"] == "application/gzip" + and resource.cdx + and resource.cdx.mimetype != "application/gzip" + ): + print( + " transfer encoding not stripped: {}".format(resource.cdx.mimetype), + file=sys.stderr, + ) inner_body = gzip.decompress(resource.body) if not inner_body: raise Exception("null body inside transfer encoding") |