Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- | python/sandcrawler/ia.py | 159 |
1 files changed, 126 insertions, 33 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index dc9aae5..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -136,6 +136,8 @@ def fuzzy_match_url(left: str, right: str) -> bool:
         return True
     if left == right + "/" or right == left + "/":
         return True
+    if left.replace("//", "/") == right.replace("//", "/"):
+        return True
     return False


@@ -147,6 +149,13 @@ def test_fuzzy_match_url() -> None:
     assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
     assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
     assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
+    assert (
+        fuzzy_match_url(
+            "https://www.cairn.info/static/images//logo-partners/logo-cnl-negatif.png",
+            "https://www.cairn.info/static/images/logo-partners/logo-cnl-negatif.png",
+        )
+        is True
+    )

     # should probably handle these?
     assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False
@@ -202,10 +211,19 @@ class CdxApiClient:
             else:
                 status_code = int(raw[4])

-            # CDX rows with no WARC records?
+            # remove CDX rows with no WARC records (?)
             if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
                 continue

+            # remove CDX rows with SHA256 (not SHA1) digests
+            if raw[5].startswith("sha-256"):
+                continue
+
+            # remove CDX rows with 'error' digests
+            # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+            if raw[5].lower() == "error":
+                continue
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],
@@ -316,7 +334,7 @@ class CdxApiClient:
         params: Dict[str, str] = {
             "url": url,
             "matchType": "exact",
-            "limit": "-25",
+            "limit": "-40",
             "output": "json",
             # Collapsing seems efficient, but is complex; would need to include
             # other filters and status code in filter
@@ -327,11 +345,14 @@ class CdxApiClient:
         if max_age_days:
             since = datetime.date.today() - datetime.timedelta(days=max_age_days)
             params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+        closest_dt = "00000000"
         if closest:
             if isinstance(closest, datetime.datetime):
-                params["closest"] = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+                closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+                params["closest"] = closest_dt
             else:
-                params["closest"] = closest
+                closest_dt = closest
+                params["closest"] = closest_dt
             params["sort"] = "closest"
         # print(params, file=sys.stderr)
         rows = self._query_api(params)
@@ -345,13 +366,15 @@ class CdxApiClient:
             *reverse* order.
             """
             return (
+                int(r.url == url),
                 int(r.status_code in (200, 226)),
                 int(0 - (r.status_code or 999)),
                 int(r.mimetype == best_mimetype),
                 int(r.mimetype != "warc/revisit"),
-                int(r.datetime[:6]),
-                int("/" in r.warc_path),
+                r.datetime[:4] == closest_dt[:4],
                 int(r.datetime),
+                # NOTE: previously we demoted SPN records with this warc_path check ahead of datetime
+                int("/" in r.warc_path),
             )

         rows = sorted(rows, key=_cdx_sort_key)
@@ -396,6 +419,9 @@ class WaybackClient:
             "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient",
         }
         self.http_session = requests_retry_session()
+        self.record_http_session = requests_retry_session(
+            status_forcelist=[],
+        )

     def fetch_petabox(
         self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True
@@ -604,13 +630,15 @@ class WaybackClient:
         assert datetime.isdigit()

         try:
-            resp = self.http_session.get(
+            resp = self.record_http_session.get(
                 self.wayback_endpoint + datetime + "id_/" + url,
                 allow_redirects=False,
                 headers=self.replay_headers,
             )
         except requests.exceptions.TooManyRedirects:
             raise WaybackContentError("redirect loop (wayback replay fetch)")
+        except requests.exceptions.ConnectionError:
+            raise WaybackContentError("ConnectionError (wayback replay fetch)")
         except requests.exceptions.ChunkedEncodingError:
             raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
         except UnicodeDecodeError:
@@ -620,14 +648,14 @@ class WaybackClient:
             raise WaybackContentError(
                 "UnicodeDecodeError in replay request (wayback DNS flakiness?): {}".format(
                     url
                 )
             )
-        try:
-            resp.raise_for_status()
-        except Exception as e:
-            raise WaybackError(str(e))
-        # print(resp.url, file=sys.stderr)
-
         # defensively check that this is actually correct replay based on headers
         if "X-Archive-Src" not in resp.headers:
+            # check if this was an error first
+            try:
+                resp.raise_for_status()
+            except Exception as e:
+                raise WaybackError(str(e))
+            # otherwise, a weird case (200/redirect but no Src header
             raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
         if datetime not in resp.url:
             raise WaybackError(
@@ -671,11 +699,18 @@ class WaybackClient:
         assert datetime.isdigit()

         try:
-            resp = self.http_session.get(
+            # when fetching via `id_`, it is possible to get a 5xx error which
+            # is either a wayback error, or an actual replay of an upstream 5xx
+            # error. the exception control flow here is tweaked, and a
+            # different HTTP session is used, to try and differentiate between
+            # the two cases
+            resp = None
+            resp = self.record_http_session.get(
                 self.wayback_endpoint + datetime + "id_/" + url,
                 allow_redirects=False,
                 headers=self.replay_headers,
             )
+            resp.raise_for_status()
         except requests.exceptions.TooManyRedirects:
             raise WaybackContentError("redirect loop (wayback replay fetch)")
         except UnicodeDecodeError:
@@ -684,15 +719,19 @@ class WaybackClient:
             raise WaybackContentError(
                 "UnicodeDecodeError in replay request (wayback DNS flakiness?): {}".format(
                     url
                 )
             )
-        try:
-            resp.raise_for_status()
         except Exception as e:
+            if resp is not None and "X-Archive-Src" in resp.headers:
+                raise WaybackContentError(
+                    f"expected redirect record but got captured HTTP status: {resp.status_code}"
+                )
             raise WaybackError(str(e))
-        # print(resp.url, file=sys.stderr)

         # defensively check that this is actually correct replay based on headers
         # previously check for "X-Archive-Redirect-Reason" here
-        if "X-Archive-Src" not in resp.headers:
+        if (
+            "X-Archive-Src" not in resp.headers
+            and "X-Archive-Redirect-Reason" not in resp.headers
+        ):
             raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
         if datetime not in resp.url:
             raise WaybackError(
@@ -931,7 +970,9 @@ class SavePageNowClient:
         self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY"))
         self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY"))
         self.v2endpoint = v2endpoint
-        self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
+        self.v2_session = requests_retry_session(
+            retries=5, backoff_factor=3, status_forcelist=[502, 504]
+        )
         self.v2_session.headers.update(
             {
                 "User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient",
@@ -1010,20 +1051,46 @@ class SavePageNowClient:
             if domain in request_url:
                 force_simple_get = 1
                 break
-        resp = self.v2_session.post(
-            self.v2endpoint,
-            data={
-                "url": request_url,
-                "capture_all": 1,
-                "capture_outlinks": capture_outlinks,
-                "capture_screenshot": 0,
-                "if_not_archived_within": "1d",
-                "force_get": force_simple_get,
-                "skip_first_archive": 1,
-                "outlinks_availability": 0,
-                "js_behavior_timeout": 0,
-            },
-        )
+
+        # check if SPNv2 user has capacity available
+        resp = self.v2_session.get(f"{self.v2endpoint}/status/user")
+        if resp.status_code == 429:
+            raise SavePageNowBackoffError(
+                f"SPNv2 availability API status_code: {resp.status_code}"
+            )
+        elif resp.status_code != 200:
+            raise SavePageNowError(f"SPN2 availability status_code: {resp.status_code}")
+        resp.raise_for_status()
+        status_user = resp.json()
+        if status_user["available"] <= 1:
+            print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr)
+            raise SavePageNowBackoffError(
+                "SPNv2 availability: {}, url: {}".format(status_user, request_url)
+            )
+
+        req_data = {
+            "url": request_url,
+            "capture_all": 1,
+            "if_not_archived_within": "1d",
+            "skip_first_archive": 1,
+            "js_behavior_timeout": 0,
+            # NOTE: not set explicitly to 0/false because of a bug in SPNv2 API
+            # implementation
+            # "capture_screenshot": 0,
+            # "outlinks_availability": 0,
+        }
+        if force_simple_get:
+            req_data["force_get"] = force_simple_get
+        if capture_outlinks:
+            req_data["capture_outlinks"] = capture_outlinks
+        try:
+            resp = self.v2_session.post(
+                self.v2endpoint,
+                data=req_data,
+            )
+        except requests.exceptions.ConnectionError:
+            raise SavePageNowError(f"SPN2 TCP connection error {request_url=}")
+
         if resp.status_code == 429:
             raise SavePageNowBackoffError(
                 "status_code: {}, url: {}".format(resp.status_code, request_url)
@@ -1032,6 +1099,7 @@ class SavePageNowClient:
             raise SavePageNowError(
                 "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)
             )
+        resp.raise_for_status()
         resp_json = resp.json()

         if (
@@ -1040,6 +1108,30 @@ class SavePageNowClient:
             and "You have already reached the limit of active sessions" in resp_json["message"]
         ):
             raise SavePageNowBackoffError(resp_json["message"])
+        elif (
+            resp_json
+            and "message" in resp_json
+            and "The same snapshot had been made" in resp_json["message"]
+        ):
+            return SavePageNowResult(
+                False,
+                "spn2-recent-capture",
+                None,
+                request_url,
+                None,
+                None,
+                None,
+            )
+        elif resp_json.get("status") == "error":
+            return SavePageNowResult(
+                False,
+                resp_json.get("status_ext") or resp_json["status"],
+                None,
+                request_url,
+                None,
+                None,
+                None,
+            )
         elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]:
             raise SavePageNowError(
                 "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)
@@ -1047,6 +1139,7 @@ class SavePageNowClient:

         job_id = resp_json["job_id"]
         print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)
+        time.sleep(0.1)

         # poll until complete
         final_json = None
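For context on the CdxApiClient sort-key change above, here is a minimal standalone sketch of the same ranking idea. It is not part of the patch: Row is a simplified stand-in for the real CdxRow, and the example values are illustrative only.

from typing import NamedTuple, Optional


class Row(NamedTuple):
    # simplified stand-in for sandcrawler's CdxRow; only the fields the sort key touches
    url: str
    datetime: str
    status_code: Optional[int]
    mimetype: str
    warc_path: str


def sort_key(r: Row, url: str, best_mimetype: str, closest_dt: str) -> tuple:
    # same ordering idea as the patched _cdx_sort_key: exact URL match first,
    # then 200/226 status, then mimetype, then "same year as requested capture",
    # then raw recency, with the warc_path check demoted to last
    return (
        int(r.url == url),
        int(r.status_code in (200, 226)),
        int(0 - (r.status_code or 999)),
        int(r.mimetype == best_mimetype),
        int(r.mimetype != "warc/revisit"),
        r.datetime[:4] == closest_dt[:4],
        int(r.datetime),
        int("/" in r.warc_path),
    )


rows = [
    Row("http://example.com/x", "20150101000000", 200, "application/pdf", "a/b.warc.gz"),
    Row("http://example.com/x", "20221101000000", 200, "application/pdf", "a/b.warc.gz"),
]
best = sorted(
    rows,
    key=lambda r: sort_key(r, "http://example.com/x", "application/pdf", "20221030120000"),
    reverse=True,
)[0]
print(best.datetime)  # the 2022 capture wins: its year matches closest_dt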
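The new record_http_session depends on requests_retry_session() accepting a status_forcelist argument. A rough equivalent of that helper is sketched below; the real implementation lives in sandcrawler.misc, so the exact defaults and Retry settings here are assumptions.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def requests_retry_session(
    retries: int = 3,
    backoff_factor: float = 1.0,
    status_forcelist=(500, 502, 504),
) -> requests.Session:
    # a Session that retries connection errors and, optionally, the listed
    # HTTP status codes (an empty status_forcelist disables status-based retries)
    session = requests.Session()
    retry = Retry(
        total=retries,
        connect=retries,
        read=retries,
        backoff_factor=backoff_factor,
        status_forcelist=list(status_forcelist),
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


# with status_forcelist=[] (as for record_http_session above), a 5xx replay is
# returned to the caller instead of being retried, so the redirect fetch can
# inspect X-Archive-Src to tell a captured upstream 5xx from a wayback-side failure
record_http_session = requests_retry_session(status_forcelist=[])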
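The SavePageNowClient change adds a pre-flight capacity check against the SPNv2 "/status/user" endpoint before POSTing a capture request. A simplified standalone version of that check is sketched below; the default endpoint URL, the stand-in exception class, and the non-200 handling are assumptions rather than part of the patch.

import sys
import requests


class SavePageNowBackoffError(Exception):
    # stand-in for sandcrawler's backoff exception: the caller should retry later
    pass


def check_spn2_capacity(
    session: requests.Session,
    v2endpoint: str = "https://web.archive.org/save",
) -> dict:
    # ask the SPNv2 "/status/user" endpoint how many capture slots the
    # authenticated account has free, and back off instead of submitting a
    # capture request that would be rejected
    resp = session.get(f"{v2endpoint}/status/user")
    if resp.status_code == 429:
        raise SavePageNowBackoffError(f"SPNv2 availability API status_code: {resp.status_code}")
    resp.raise_for_status()
    status_user = resp.json()
    if status_user.get("available", 0) <= 1:
        print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr)
        raise SavePageNowBackoffError(f"SPNv2 availability: {status_user}")
    return status_user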