Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r--  python/sandcrawler/ia.py | 159
 1 file changed, 126 insertions(+), 33 deletions(-)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index dc9aae5..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -136,6 +136,8 @@ def fuzzy_match_url(left: str, right: str) -> bool:
         return True
     if left == right + "/" or right == left + "/":
         return True
+    if left.replace("//", "/") == right.replace("//", "/"):
+        return True
     return False
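
The new clause makes fuzzy_match_url treat a doubled slash in a URL path as equivalent to a single one. A minimal standalone sketch of just this comparison (function name hypothetical; the real function also normalizes scheme and trailing slashes):

    def urls_match_ignoring_double_slash(left: str, right: str) -> bool:
        # collapse any "//" to "/"; this also rewrites the "://" scheme
        # separator, which is harmless because both sides are normalized
        # identically before comparison
        return left.replace("//", "/") == right.replace("//", "/")

    assert urls_match_ignoring_double_slash(
        "https://example.com/static//logo.png",
        "https://example.com/static/logo.png",
    )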
@@ -147,6 +149,13 @@ def test_fuzzy_match_url() -> None:
     assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
     assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
     assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
+    assert (
+        fuzzy_match_url(
+            "https://www.cairn.info/static/images//logo-partners/logo-cnl-negatif.png",
+            "https://www.cairn.info/static/images/logo-partners/logo-cnl-negatif.png",
+        )
+        is True
+    )
 
     # should probably handle these?
     assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False
@@ -202,10 +211,19 @@ class CdxApiClient:
             else:
                 status_code = int(raw[4])
 
-            # CDX rows with no WARC records?
+            # remove CDX rows with no WARC records (?)
             if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
                 continue
 
+            # remove CDX rows with SHA256 (not SHA1) digests
+            if raw[5].startswith("sha-256"):
+                continue
+
+            # remove CDX rows with 'error' digests
+            # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+            if raw[5].lower() == "error":
+                continue
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],
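
With output=json, the CDX API returns each capture as an array of strings; index 5 is the content digest (normally base32 SHA-1) and indexes 8 through 10 are the compressed length, offset, and WARC filename. A rough sketch of the combined row filter, assuming that column layout:

    from typing import List

    def is_usable_cdx_row(raw: List[str]) -> bool:
        # "-" in the length/offset/warc-path columns means there is no
        # WARC record to fetch for this capture
        if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
            return False
        digest = raw[5]
        # downstream code assumes base32 SHA-1 digests; skip SHA-256 rows
        if digest.startswith("sha-256"):
            return False
        # some rows carry a literal "error" digest; skip those too
        if digest.lower() == "error":
            return False
        return True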
@@ -316,7 +334,7 @@ class CdxApiClient:
         params: Dict[str, str] = {
             "url": url,
             "matchType": "exact",
-            "limit": "-25",
+            "limit": "-40",
             "output": "json",
             # Collapsing seems efficient, but is complex; would need to include
             # other filters and status code in filter
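
For reference, the CDX server treats a negative limit as "return the last N matching rows", i.e. the most recent captures; this change simply widens the candidate pool from 25 to 40. Assembled as a standalone request (illustrative values only), the query looks roughly like:

    import requests

    resp = requests.get(
        "https://web.archive.org/cdx/search/cdx",
        params={
            "url": "https://example.com/paper.pdf",
            "matchType": "exact",
            "limit": "-40",  # negative: last 40 rows (most recent captures)
            "output": "json",
        },
    )
    rows = resp.json()[1:]  # first element is the field-names header row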
@@ -327,11 +345,14 @@ class CdxApiClient:
         if max_age_days:
             since = datetime.date.today() - datetime.timedelta(days=max_age_days)
             params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+        closest_dt = "00000000"
         if closest:
             if isinstance(closest, datetime.datetime):
-                params["closest"] = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+                closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+                params["closest"] = closest_dt
             else:
-                params["closest"] = closest
+                closest_dt = closest
+                params["closest"] = closest_dt
             params["sort"] = "closest"
         # print(params, file=sys.stderr)
         rows = self._query_api(params)
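
The refactor keeps the normalized closest timestamp in a local closest_dt (defaulting to "00000000") so the sort key below can compare capture years against it, instead of only stashing the value in the query params. Condensed, the normalization is:

    import datetime
    from typing import Union

    def normalize_closest(closest: Union[datetime.datetime, str, None]) -> str:
        # "00000000" makes the year-match term in the sort key False for all rows
        if not closest:
            return "00000000"
        if isinstance(closest, datetime.datetime):
            return "%04d%02d%02d" % (closest.year, closest.month, closest.day)
        return closest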
@@ -345,13 +366,15 @@ class CdxApiClient:
         *reverse* order.
         """
         return (
+            int(r.url == url),
             int(r.status_code in (200, 226)),
             int(0 - (r.status_code or 999)),
             int(r.mimetype == best_mimetype),
             int(r.mimetype != "warc/revisit"),
-            int(r.datetime[:6]),
-            int("/" in r.warc_path),
+            r.datetime[:4] == closest_dt[:4],
             int(r.datetime),
+            # NOTE: previously we demoted SPN records with this warc_path check ahead of datetime
+            int("/" in r.warc_path),
         )
 
         rows = sorted(rows, key=_cdx_sort_key)
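
Python compares tuples element by element, so each term of this key acts as a tie-breaker for the ones before it: exact URL match, then 2xx status, then mimetype checks, then whether the capture shares the requested year, then the full timestamp, and only last the warc_path heuristic that demotes SPN captures. Per the docstring, rows sort into reverse order, so the best row ends up last. A toy demonstration of the tuple-ranking idea, with simplified hypothetical fields:

    closest_dt = "20200115000000"
    rows = [
        {"status": 503, "datetime": "20221101000000"},
        {"status": 200, "datetime": "20200101000000"},
    ]

    def sort_key(r):
        return (
            int(r["status"] == 200),              # successful captures win...
            r["datetime"][:4] == closest_dt[:4],  # ...then same-year captures...
            int(r["datetime"]),                   # ...then the latest timestamp
        )

    best = sorted(rows, key=sort_key)[-1]  # best row sorts last
    assert best["status"] == 200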
@@ -396,6 +419,9 @@ class WaybackClient:
"User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient",
}
self.http_session = requests_retry_session()
+ self.record_http_session = requests_retry_session(
+ status_forcelist=[],
+ )
def fetch_petabox(
self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True
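
requests_retry_session is this package's wrapper around a urllib3 Retry adapter. Passing status_forcelist=[] yields a session that still retries connection-level failures but never retries based on HTTP status, which matters below: a 5xx from the replay endpoint may be a faithful capture of an upstream error, not a wayback failure. A sketch of the common pattern such a helper follows (an assumption about the helper, which is defined elsewhere in this package):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def requests_retry_session(retries=3, backoff_factor=1,
                               status_forcelist=(500, 502, 504)):
        # retry connect/read errors, plus any listed status codes
        session = requests.Session()
        retry = Retry(
            total=retries,
            connect=retries,
            read=retries,
            status_forcelist=list(status_forcelist),
            backoff_factor=backoff_factor,
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        return session

    # status_forcelist=[]: 5xx responses are returned to the caller, not retried
    record_http_session = requests_retry_session(status_forcelist=[])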
@@ -604,13 +630,15 @@ class WaybackClient:
         assert datetime.isdigit()
 
         try:
-            resp = self.http_session.get(
+            resp = self.record_http_session.get(
                 self.wayback_endpoint + datetime + "id_/" + url,
                 allow_redirects=False,
                 headers=self.replay_headers,
             )
         except requests.exceptions.TooManyRedirects:
             raise WaybackContentError("redirect loop (wayback replay fetch)")
+        except requests.exceptions.ConnectionError:
+            raise WaybackContentError("ConnectionError (wayback replay fetch)")
         except requests.exceptions.ChunkedEncodingError:
             raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
         except UnicodeDecodeError:
@@ -620,14 +648,14 @@ class WaybackClient:
                 )
             )
 
-        try:
-            resp.raise_for_status()
-        except Exception as e:
-            raise WaybackError(str(e))
-        # print(resp.url, file=sys.stderr)
-
         # defensively check that this is actually correct replay based on headers
         if "X-Archive-Src" not in resp.headers:
+            # check if this was an error first
+            try:
+                resp.raise_for_status()
+            except Exception as e:
+                raise WaybackError(str(e))
+            # otherwise, a weird case (200/redirect but no Src header)
             raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
         if datetime not in resp.url:
             raise WaybackError(
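
Moving raise_for_status() inside the missing-header branch changes the semantics: a response carrying X-Archive-Src is a genuine replay even when its status is 4xx/5xx (that was the original site's response), so it must not be converted into a WaybackError. Only when the provenance header is absent do we ask whether wayback itself failed. Schematically, with a hypothetical helper name:

    import requests

    class WaybackError(Exception): ...

    def check_replay_response(resp: requests.Response) -> requests.Response:
        # X-Archive-Src identifies the WARC the capture came from; if present,
        # even a 4xx/5xx status is a faithful replay and is returned as-is
        if "X-Archive-Src" not in resp.headers:
            # no provenance header: either wayback itself errored...
            try:
                resp.raise_for_status()
            except Exception as e:
                raise WaybackError(str(e))
            # ...or a 200/redirect with no Src header, which is a weird case
            raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
        return resp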
@@ -671,11 +699,18 @@ class WaybackClient:
         assert datetime.isdigit()
 
         try:
-            resp = self.http_session.get(
+            # when fetching via `id_`, it is possible to get a 5xx error which
+            # is either a wayback error, or an actual replay of an upstream 5xx
+            # error. the exception control flow here is tweaked, and a
+            # different HTTP session is used, to try and differentiate between
+            # the two cases
+            resp = None
+            resp = self.record_http_session.get(
                 self.wayback_endpoint + datetime + "id_/" + url,
                 allow_redirects=False,
                 headers=self.replay_headers,
             )
+            resp.raise_for_status()
         except requests.exceptions.TooManyRedirects:
             raise WaybackContentError("redirect loop (wayback replay fetch)")
         except UnicodeDecodeError:
@@ -684,15 +719,19 @@ class WaybackClient:
                     url
                 )
             )
-        try:
-            resp.raise_for_status()
         except Exception as e:
+            if resp is not None and "X-Archive-Src" in resp.headers:
+                raise WaybackContentError(
+                    f"expected redirect record but got captured HTTP status: {resp.status_code}"
+                )
             raise WaybackError(str(e))
-        # print(resp.url, file=sys.stderr)
 
         # defensively check that this is actually correct replay based on headers
         # previously checked for "X-Archive-Redirect-Reason" here
-        if "X-Archive-Src" not in resp.headers:
+        if (
+            "X-Archive-Src" not in resp.headers
+            and "X-Archive-Redirect-Reason" not in resp.headers
+        ):
             raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
         if datetime not in resp.url:
             raise WaybackError(
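
The redirect-fetch path now folds raise_for_status() into the main try block and uses the same X-Archive-Src header to split the ambiguous 5xx case the new comment describes: header present means the error status was itself the archived capture (WaybackContentError, not retryable), header absent means wayback infrastructure failed (WaybackError). A compressed sketch of that control flow, with hypothetical names:

    import requests

    class WaybackError(Exception): ...
    class WaybackContentError(Exception): ...

    def fetch_redirect_replay(session: requests.Session, replay_url: str) -> requests.Response:
        resp = None
        try:
            resp = session.get(replay_url, allow_redirects=False)
            resp.raise_for_status()
        except Exception as e:
            if resp is not None and "X-Archive-Src" in resp.headers:
                # the 5xx was itself the archived capture: a content problem
                raise WaybackContentError(
                    f"expected redirect record but got captured HTTP status: {resp.status_code}"
                )
            # no provenance header: wayback infrastructure failed
            raise WaybackError(str(e))
        return resp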
@@ -931,7 +970,9 @@ class SavePageNowClient:
         self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY"))
         self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY"))
         self.v2endpoint = v2endpoint
-        self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
+        self.v2_session = requests_retry_session(
+            retries=5, backoff_factor=3, status_forcelist=[502, 504]
+        )
         self.v2_session.headers.update(
             {
                 "User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient",
@@ -1010,20 +1051,46 @@ class SavePageNowClient:
             if domain in request_url:
                 force_simple_get = 1
                 break
-        resp = self.v2_session.post(
-            self.v2endpoint,
-            data={
-                "url": request_url,
-                "capture_all": 1,
-                "capture_outlinks": capture_outlinks,
-                "capture_screenshot": 0,
-                "if_not_archived_within": "1d",
-                "force_get": force_simple_get,
-                "skip_first_archive": 1,
-                "outlinks_availability": 0,
-                "js_behavior_timeout": 0,
-            },
-        )
+
+        # check if SPNv2 user has capacity available
+        resp = self.v2_session.get(f"{self.v2endpoint}/status/user")
+        if resp.status_code == 429:
+            raise SavePageNowBackoffError(
+                f"SPNv2 availability API status_code: {resp.status_code}"
+            )
+        elif resp.status_code != 200:
+            raise SavePageNowError(f"SPN2 availability status_code: {resp.status_code}")
+        resp.raise_for_status()
+        status_user = resp.json()
+        if status_user["available"] <= 1:
+            print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr)
+            raise SavePageNowBackoffError(
+                "SPNv2 availability: {}, url: {}".format(status_user, request_url)
+            )
+
+        req_data = {
+            "url": request_url,
+            "capture_all": 1,
+            "if_not_archived_within": "1d",
+            "skip_first_archive": 1,
+            "js_behavior_timeout": 0,
+            # NOTE: not set explicitly to 0/false because of a bug in SPNv2 API
+            # implementation
+            # "capture_screenshot": 0,
+            # "outlinks_availability": 0,
+        }
+        if force_simple_get:
+            req_data["force_get"] = force_simple_get
+        if capture_outlinks:
+            req_data["capture_outlinks"] = capture_outlinks
+        try:
+            resp = self.v2_session.post(
+                self.v2endpoint,
+                data=req_data,
+            )
+        except requests.exceptions.ConnectionError:
+            raise SavePageNowError(f"SPN2 TCP connection error {request_url=}")
+
         if resp.status_code == 429:
             raise SavePageNowBackoffError(
                 "status_code: {}, url: {}".format(resp.status_code, request_url)
@@ -1032,6 +1099,7 @@ class SavePageNowClient:
             raise SavePageNowError(
                 "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)
             )
+        resp.raise_for_status()
         resp_json = resp.json()
 
         if (
@@ -1040,6 +1108,30 @@ class SavePageNowClient:
and "You have already reached the limit of active sessions" in resp_json["message"]
):
raise SavePageNowBackoffError(resp_json["message"])
+ elif (
+ resp_json
+ and "message" in resp_json
+ and "The same snapshot had been made" in resp_json["message"]
+ ):
+ return SavePageNowResult(
+ False,
+ "spn2-recent-capture",
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif resp_json.get("status") == "error":
+ return SavePageNowResult(
+ False,
+ resp_json.get("status_ext") or resp_json["status"],
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]:
raise SavePageNowError(
"Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)
@@ -1047,6 +1139,7 @@ class SavePageNowClient:
         job_id = resp_json["job_id"]
         print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)
+        time.sleep(0.1)
 
         # poll until complete
         final_json = None