Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r--  python/sandcrawler/ia.py | 611
1 files changed, 354 insertions, 257 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 8f28d42..99a7f36 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -34,50 +34,63 @@ class SandcrawlerBackoffError(Exception):
be passed up through any timeout/retry code and become an actual long pause
or crash.
"""
+
pass
-ResourceResult = namedtuple("ResourceResult", [
- "start_url",
- "hit",
- "status",
- "terminal_url",
- "terminal_dt",
- "terminal_status_code",
- "body",
- "cdx",
- "revisit_cdx",
-])
-
-WarcResource = namedtuple("WarcResource", [
- "status_code",
- "location",
- "body",
- "revisit_cdx",
-])
-
-CdxRow = namedtuple('CdxRow', [
- 'surt',
- 'datetime',
- 'url',
- 'mimetype',
- 'status_code',
- 'sha1b32',
- 'sha1hex',
- 'warc_csize',
- 'warc_offset',
- 'warc_path',
-])
-
-CdxPartial = namedtuple('CdxPartial', [
- 'surt',
- 'datetime',
- 'url',
- 'mimetype',
- 'status_code',
- 'sha1b32',
- 'sha1hex',
-])
+ResourceResult = namedtuple(
+ "ResourceResult",
+ [
+ "start_url",
+ "hit",
+ "status",
+ "terminal_url",
+ "terminal_dt",
+ "terminal_status_code",
+ "body",
+ "cdx",
+ "revisit_cdx",
+ ],
+)
+
+WarcResource = namedtuple(
+ "WarcResource",
+ [
+ "status_code",
+ "location",
+ "body",
+ "revisit_cdx",
+ ],
+)
+
+CdxRow = namedtuple(
+ "CdxRow",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ "warc_csize",
+ "warc_offset",
+ "warc_path",
+ ],
+)
+
+CdxPartial = namedtuple(
+ "CdxPartial",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ ],
+)
def cdx_partial_from_row(row: Union[CdxRow, CdxPartial]) -> CdxPartial:
@@ -102,10 +115,10 @@ def cdx_to_dict(cdx: Union[CdxRow, CdxPartial]) -> Dict[str, Any]:
"sha1b32": cdx.sha1b32,
"sha1hex": cdx.sha1hex,
}
- if type(cdx) == CdxRow and '/' in cdx.warc_path:
- d['warc_csize'] = cdx.warc_csize
- d['warc_offset'] = cdx.warc_offset
- d['warc_path'] = cdx.warc_path
+ if type(cdx) == CdxRow and "/" in cdx.warc_path:
+ d["warc_csize"] = cdx.warc_csize
+ d["warc_offset"] = cdx.warc_offset
+ d["warc_path"] = cdx.warc_path
return d
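(Not part of the diff: a minimal orientation sketch of the CdxPartial namedtuple and the cdx_to_dict helper reformatted above. All field values below are invented placeholders.)

from sandcrawler.ia import CdxPartial, cdx_to_dict

# Hypothetical values, for illustration only.
partial = CdxPartial(
    surt="org,example)/paper.pdf",
    datetime="20200202123456",
    url="https://example.org/paper.pdf",
    mimetype="application/pdf",
    status_code=200,
    sha1b32="PLACEHOLDERB32PLACEHOLDERB32ABCD",
    sha1hex="0123456789abcdef0123456789abcdef01234567",
)

d = cdx_to_dict(partial)
# warc_csize/warc_offset/warc_path keys are only included for full CdxRow
# entries whose warc_path contains a "/", per the condition above.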
@@ -116,9 +129,9 @@ def fuzzy_match_url(left: str, right: str) -> bool:
"""
if left == right:
return True
- if '://' in left and '://' in right:
- left = '://'.join(left.split('://')[1:])
- right = '://'.join(right.split('://')[1:])
+ if "://" in left and "://" in right:
+ left = "://".join(left.split("://")[1:])
+ right = "://".join(right.split("://")[1:])
if left == right:
return True
if left == right + "/" or right == left + "/":
@@ -149,14 +162,17 @@ class CdxApiClient:
def __init__(self, host_url: str = "https://web.archive.org/cdx/search/cdx", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
- cdx_auth_token = kwargs.get('cdx_auth_token', os.environ.get('CDX_AUTH_TOKEN'))
+ cdx_auth_token = kwargs.get("cdx_auth_token", os.environ.get("CDX_AUTH_TOKEN"))
if not cdx_auth_token:
raise Exception(
- "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)")
- self.http_session.headers.update({
- 'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient',
- 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token),
- })
+ "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)"
+ )
+ self.http_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.CdxApiClient",
+ "Cookie": "cdx_auth_token={}".format(cdx_auth_token),
+ }
+ )
def _query_api(self, params: Dict[str, str]) -> Optional[List[CdxRow]]:
"""
@@ -165,7 +181,7 @@ class CdxApiClient:
resp = self.http_session.get(self.host_url, params=params)
if resp.status_code != 200:
raise CdxApiError(resp.text)
- #print(resp.url, file=sys.stderr)
+ # print(resp.url, file=sys.stderr)
if not resp.text:
return None
rj = resp.json()
@@ -187,7 +203,7 @@ class CdxApiClient:
status_code = int(raw[4])
# CDX rows with no WARC records?
- if raw[8] == '-' or raw[9] == '-' or raw[10] == '-':
+ if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
continue
row = CdxRow(
@@ -206,28 +222,31 @@ class CdxApiClient:
rows.append(row)
return rows
- def fetch(self,
- url: str,
- datetime: str,
- filter_status_code: Optional[int] = None,
- retry_sleep: Optional[int] = None) -> CdxRow:
+ def fetch(
+ self,
+ url: str,
+ datetime: str,
+ filter_status_code: Optional[int] = None,
+ retry_sleep: Optional[int] = None,
+ ) -> CdxRow:
"""
Fetches a single CDX row by url/datetime. Raises a KeyError if not
found, because we expect to be looking up a specific full record.
"""
if len(datetime) != 14:
raise ValueError(
- "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime))
+ "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)
+ )
params: Dict[str, str] = {
- 'url': url,
- 'from': datetime,
- 'to': datetime,
- 'matchType': 'exact',
- 'limit': "1",
- 'output': 'json',
+ "url": url,
+ "from": datetime,
+ "to": datetime,
+ "matchType": "exact",
+ "limit": "1",
+ "output": "json",
}
if filter_status_code:
- params['filter'] = "statuscode:{}".format(filter_status_code)
+ params["filter"] = "statuscode:{}".format(filter_status_code)
resp = self._query_api(params)
if not resp:
if retry_sleep and retry_sleep > 0:
@@ -235,37 +254,43 @@ class CdxApiClient:
if retry_sleep > 3:
next_sleep = retry_sleep - 3
retry_sleep = 3
- print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
- file=sys.stderr)
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
time.sleep(retry_sleep)
- return self.fetch(url,
- datetime,
- filter_status_code=filter_status_code,
- retry_sleep=next_sleep)
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep
+ )
raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
row = resp[0]
# allow fuzzy http/https match
if not (fuzzy_match_url(row.url, url) and row.datetime == datetime):
if retry_sleep and retry_sleep > 0:
- print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
- file=sys.stderr)
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
time.sleep(retry_sleep)
- return self.fetch(url,
- datetime,
- filter_status_code=filter_status_code,
- retry_sleep=None)
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=None
+ )
raise KeyError(
"Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(
- url, datetime, row))
+ url, datetime, row
+ )
+ )
if filter_status_code:
assert row.status_code == filter_status_code
return row
- def lookup_best(self,
- url: str,
- max_age_days: Optional[int] = None,
- best_mimetype: Optional[str] = None,
- closest: Union[datetime.datetime, str, None] = None) -> Optional[CdxRow]:
+ def lookup_best(
+ self,
+ url: str,
+ max_age_days: Optional[int] = None,
+ best_mimetype: Optional[str] = None,
+ closest: Union[datetime.datetime, str, None] = None,
+ ) -> Optional[CdxRow]:
"""
Fetches multiple CDX rows for the given URL, tries to find the most recent.
@@ -289,27 +314,26 @@ class CdxApiClient:
"""
params: Dict[str, str] = {
- 'url': url,
- 'matchType': 'exact',
- 'limit': "-25",
- 'output': 'json',
+ "url": url,
+ "matchType": "exact",
+ "limit": "-25",
+ "output": "json",
# Collapsing seems efficient, but is complex; would need to include
# other filters and status code in filter
#'collapse': 'timestamp:6',
-
# Revisits now allowed and resolved!
#'filter': '!mimetype:warc/revisit',
}
if max_age_days:
since = datetime.date.today() - datetime.timedelta(days=max_age_days)
- params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day)
+ params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
if closest:
if isinstance(closest, datetime.datetime):
- params['closest'] = '%04d%02d%02d' % (closest.year, closest.month, closest.day)
+ params["closest"] = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
else:
- params['closest'] = closest
- params['sort'] = "closest"
- #print(params, file=sys.stderr)
+ params["closest"] = closest
+ params["sort"] = "closest"
+ # print(params, file=sys.stderr)
rows = self._query_api(params)
if not rows:
return None
@@ -326,7 +350,7 @@ class CdxApiClient:
int(r.mimetype == best_mimetype),
int(r.mimetype != "warc/revisit"),
int(r.datetime[:6]),
- int('/' in r.warc_path),
+ int("/" in r.warc_path),
int(r.datetime),
)
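(Not part of the diff: a hedged usage sketch of the CdxApiClient reformatted above. It assumes a CDX_AUTH_TOKEN environment variable is set, as the constructor requires; the URL is invented.)

from sandcrawler.ia import CdxApiClient

client = CdxApiClient()  # raises if no cdx_auth_token kwarg and no CDX_AUTH_TOKEN env var
best = client.lookup_best(
    "https://example.org/paper.pdf",   # hypothetical URL
    max_age_days=365,
    best_mimetype="application/pdf",
)
if best:
    # fields from the CdxRow namedtuple defined earlier in the module
    print(best.datetime, best.status_code, best.mimetype, best.warc_path)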
@@ -358,25 +382,23 @@ class WaybackClient:
self.cdx_client = CdxApiClient()
# /serve/ instead of /download/ doesn't record view count
# this *does* want to be http://, not https://
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
self.petabox_webdata_secret = kwargs.get(
- 'petabox_webdata_secret',
- os.environ.get('PETABOX_WEBDATA_SECRET'),
+ "petabox_webdata_secret",
+ os.environ.get("PETABOX_WEBDATA_SECRET"),
)
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix", "https://archive.org/serve/")
self.rstore = None
self.max_redirects = 25
self.wayback_endpoint = "https://web.archive.org/web/"
self.replay_headers = {
- 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient',
+ "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient",
}
- def fetch_petabox(self,
- csize: int,
- offset: int,
- warc_path: str,
- resolve_revisit: bool = True) -> WarcResource:
+ def fetch_petabox(
+ self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True
+ ) -> WarcResource:
"""
Fetches wayback resource directly from petabox using WARC path/offset/csize.
@@ -401,37 +423,49 @@ class WaybackClient:
raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
if "/" not in warc_path:
raise ValueError(
- "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
+ "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path)
+ )
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
self.rstore = ResourceStore(
- loaderfactory=CDXLoaderFactory3(webdata_secret=self.petabox_webdata_secret, ))
+ loaderfactory=CDXLoaderFactory3(
+ webdata_secret=self.petabox_webdata_secret,
+ )
+ )
assert self.rstore
try:
- #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
+ # print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
except wayback.exception.ResourceUnavailable:
print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
raise PetaboxError(
- "failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ "failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
except wayback.exception.InvalidResource:
print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
raise WaybackContentError(
- "failed to load file contents from wayback/petabox (InvalidResource)")
+ "failed to load file contents from wayback/petabox (InvalidResource)"
+ )
except urllib3.exceptions.ReadTimeoutError as rte:
raise PetaboxError(
- "failed to load file contents from wayback/petabox (ReadTimeoutError: {})".
- format(rte))
+ "failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(
+ rte
+ )
+ )
except ValueError as ve:
raise PetaboxError(
- "failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ "failed to load file contents from wayback/petabox (ValueError: {})".format(ve)
+ )
except EOFError as eofe:
raise PetaboxError(
- "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)
+ )
except TypeError as te:
raise PetaboxError(
- "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
- .format(te))
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ )
+ )
except Exception as e:
if "while decompressing data: invalid block type" in str(e):
raise PetaboxError(
@@ -449,8 +483,11 @@ class WaybackClient:
raise WaybackContentError("too many HTTP headers (in wayback fetch)")
location = gwb_record.get_location() or None
- if status_code is None and gwb_record.target_uri.startswith(
- b"ftp://") and not gwb_record.is_revisit():
+ if (
+ status_code is None
+ and gwb_record.target_uri.startswith(b"ftp://")
+ and not gwb_record.is_revisit()
+ ):
# TODO: some additional verification here?
status_code = 226
@@ -463,17 +500,19 @@ class WaybackClient:
if not (revisit_uri and revisit_dt):
raise WaybackContentError(
"revisit record missing URI and/or DT: warc:{} offset:{}".format(
- warc_path, offset))
+ warc_path, offset
+ )
+ )
# convert revisit_dt
# len("2018-07-24T11:56:49"), or with "Z"
assert len(revisit_dt) in (19, 20)
if type(revisit_uri) is bytes:
- revisit_uri = revisit_uri.decode('utf-8')
+ revisit_uri = revisit_uri.decode("utf-8")
if type(revisit_dt) is bytes:
- revisit_dt = revisit_dt.decode('utf-8')
- revisit_dt = revisit_dt.replace('-', '').replace(':',
- '').replace('T',
- '').replace('Z', '')
+ revisit_dt = revisit_dt.decode("utf-8")
+ revisit_dt = (
+ revisit_dt.replace("-", "").replace(":", "").replace("T", "").replace("Z", "")
+ )
assert len(revisit_dt) == 14
try:
revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
@@ -491,8 +530,10 @@ class WaybackClient:
body = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
raise WaybackError(
- "failed to read actual file contents from wayback/petabox (IncompleteRead: {})"
- .format(ire))
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ )
+ )
elif status_code is None:
raise WaybackContentError("got a None status_code in (W)ARC record")
return WarcResource(
@@ -502,12 +543,14 @@ class WaybackClient:
revisit_cdx=revisit_cdx,
)
- def fetch_petabox_body(self,
- csize: int,
- offset: int,
- warc_path: str,
- resolve_revisit: bool = True,
- expected_status_code: Optional[int] = None) -> bytes:
+ def fetch_petabox_body(
+ self,
+ csize: int,
+ offset: int,
+ warc_path: str,
+ resolve_revisit: bool = True,
+ expected_status_code: Optional[int] = None,
+ ) -> bytes:
"""
Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
@@ -524,20 +567,22 @@ class WaybackClient:
if expected_status_code:
if expected_status_code != resource.status_code:
- raise KeyError("archived HTTP response (WARC) was not {}: {}".format(
- expected_status_code,
- resource.status_code,
- ))
+ raise KeyError(
+ "archived HTTP response (WARC) was not {}: {}".format(
+ expected_status_code,
+ resource.status_code,
+ )
+ )
elif resource.status_code not in (200, 226):
- raise KeyError("archived HTTP response (WARC) was not 200: {}".format(
- resource.status_code))
+ raise KeyError(
+ "archived HTTP response (WARC) was not 200: {}".format(resource.status_code)
+ )
return resource.body
- def fetch_replay_body(self,
- url: str,
- datetime: str,
- cdx_sha1hex: Optional[str] = None) -> bytes:
+ def fetch_replay_body(
+ self, url: str, datetime: str, cdx_sha1hex: Optional[str] = None
+ ) -> bytes:
"""
Fetches an HTTP 200 record from wayback via the replay interface
(web.archive.org) instead of petabox.
@@ -570,32 +615,42 @@ class WaybackClient:
except UnicodeDecodeError:
raise WaybackContentError(
"UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
- url))
+ url
+ )
+ )
try:
resp.raise_for_status()
except Exception as e:
raise WaybackError(str(e))
- #print(resp.url, file=sys.stderr)
+ # print(resp.url, file=sys.stderr)
# defensively check that this is actually correct replay based on headers
if "X-Archive-Src" not in resp.headers:
raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
if datetime not in resp.url:
- raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(
- datetime, resp.url))
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
if cdx_sha1hex:
# verify that body matches CDX hash
# TODO: don't need *all* these hashes, just sha1
file_meta = gen_file_metadata(resp.content)
- if cdx_sha1hex != file_meta['sha1hex']:
- print(" REPLAY MISMATCH: cdx:{} replay:{}".format(cdx_sha1hex,
- file_meta['sha1hex']),
- file=sys.stderr)
+ if cdx_sha1hex != file_meta["sha1hex"]:
+ print(
+ " REPLAY MISMATCH: cdx:{} replay:{}".format(
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
+ file=sys.stderr,
+ )
raise WaybackContentError(
"replay fetch body didn't match CDX hash cdx:{} body:{}".format(
- cdx_sha1hex, file_meta['sha1hex']), )
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
+ )
return resp.content
def fetch_replay_redirect(self, url: str, datetime: str) -> Optional[str]:
@@ -625,37 +680,44 @@ class WaybackClient:
except UnicodeDecodeError:
raise WaybackContentError(
"UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
- url))
+ url
+ )
+ )
try:
resp.raise_for_status()
except Exception as e:
raise WaybackError(str(e))
- #print(resp.url, file=sys.stderr)
+ # print(resp.url, file=sys.stderr)
# defensively check that this is actually correct replay based on headers
# previously check for "X-Archive-Redirect-Reason" here
if "X-Archive-Src" not in resp.headers:
raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
if datetime not in resp.url:
- raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(
- datetime, resp.url))
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
redirect_url = resp.headers.get("Location")
# eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw
- #print(redirect_url, file=sys.stderr)
+ # print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
redirect_url = "/".join(redirect_url.split("/")[5:])
- #print(redirect_url, file=sys.stderr)
+ # print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("http"):
redirect_url = clean_url(redirect_url)
return redirect_url
else:
return None
- def lookup_resource(self,
- start_url: str,
- best_mimetype: Optional[str] = None,
- closest: Union[str, datetime.datetime, None] = None) -> ResourceResult:
+ def lookup_resource(
+ self,
+ start_url: str,
+ best_mimetype: Optional[str] = None,
+ closest: Union[str, datetime.datetime, None] = None,
+ ) -> ResourceResult:
"""
Looks in wayback for a resource starting at the URL, following any
redirects. Returns a ResourceResult object, which may indicate a
@@ -684,8 +746,9 @@ class WaybackClient:
for i in range(self.max_redirects + 1):
print(" URL: {}".format(next_url), file=sys.stderr)
next_row: Optional[CdxRow] = self.cdx_client.lookup_best(
- next_url, best_mimetype=best_mimetype, closest=closest)
- #print(next_row, file=sys.stderr)
+ next_url, best_mimetype=best_mimetype, closest=closest
+ )
+ # print(next_row, file=sys.stderr)
if not next_row:
return ResourceResult(
start_url=start_url,
@@ -702,7 +765,7 @@ class WaybackClient:
cdx_row: CdxRow = next_row
# first try straight-forward redirect situation
- if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path:
+ if cdx_row.mimetype == "warc/revisit" and "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -725,7 +788,7 @@ class WaybackClient:
if cdx_row.status_code in (200, 226):
revisit_cdx = None
final_cdx: Union[CdxRow, CdxPartial] = cdx_row
- if '/' in cdx_row.warc_path:
+ if "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -751,7 +814,7 @@ class WaybackClient:
revisit_cdx=revisit_cdx,
)
elif 300 <= (cdx_row.status_code or 0) < 400:
- if '/' in cdx_row.warc_path:
+ if "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -848,34 +911,39 @@ class SavePageNowBackoffError(SandcrawlerBackoffError):
pass
-SavePageNowResult = namedtuple('SavePageNowResult', [
- 'success',
- 'status',
- 'job_id',
- 'request_url',
- 'terminal_url',
- 'terminal_dt',
- 'resources',
-])
+SavePageNowResult = namedtuple(
+ "SavePageNowResult",
+ [
+ "success",
+ "status",
+ "job_id",
+ "request_url",
+ "terminal_url",
+ "terminal_dt",
+ "resources",
+ ],
+)
class SavePageNowClient:
def __init__(self, v2endpoint: str = "https://web.archive.org/save", **kwargs):
- self.ia_access_key = kwargs.get('ia_access_key', os.environ.get('IA_ACCESS_KEY'))
- self.ia_secret_key = kwargs.get('ia_secret_key', os.environ.get('IA_SECRET_KEY'))
+ self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY"))
+ self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY"))
self.v2endpoint = v2endpoint
self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
- self.v2_session.headers.update({
- 'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient',
- 'Accept': 'application/json',
- 'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key),
- })
+ self.v2_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient",
+ "Accept": "application/json",
+ "Authorization": "LOW {}:{}".format(self.ia_access_key, self.ia_secret_key),
+ }
+ )
# 3 minutes total
self.poll_count = 60
self.poll_seconds = 3.0
- self.spn_cdx_retry_sec = kwargs.get('spn_cdx_retry_sec', 9.0)
+ self.spn_cdx_retry_sec = kwargs.get("spn_cdx_retry_sec", 9.0)
# these are special-case web domains for which we want SPN2 to not run
# a headless browser (brozzler), but instead simply run wget.
@@ -888,20 +956,20 @@ class SavePageNowClient:
"://europepmc.org/backend/ptpmcrender.fcgi",
"://pdfs.semanticscholar.org/",
"://res.mdpi.com/",
-
# platform sites
"://zenodo.org/",
"://figshare.org/",
"://springernature.figshare.com/",
-
# popular simple cloud storage or direct links
"://s3-eu-west-1.amazonaws.com/",
]
- def save_url_now_v2(self,
- request_url: str,
- force_simple_get: Optional[int] = None,
- capture_outlinks: int = 0) -> SavePageNowResult:
+ def save_url_now_v2(
+ self,
+ request_url: str,
+ force_simple_get: Optional[int] = None,
+ capture_outlinks: int = 0,
+ ) -> SavePageNowResult:
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -944,33 +1012,39 @@ class SavePageNowClient:
resp = self.v2_session.post(
self.v2endpoint,
data={
- 'url': request_url,
- 'capture_all': 1,
- 'capture_outlinks': capture_outlinks,
- 'capture_screenshot': 0,
- 'if_not_archived_within': '1d',
- 'force_get': force_simple_get,
- 'skip_first_archive': 1,
- 'outlinks_availability': 0,
- 'js_behavior_timeout': 0,
+ "url": request_url,
+ "capture_all": 1,
+ "capture_outlinks": capture_outlinks,
+ "capture_screenshot": 0,
+ "if_not_archived_within": "1d",
+ "force_get": force_simple_get,
+ "skip_first_archive": 1,
+ "outlinks_availability": 0,
+ "js_behavior_timeout": 0,
},
)
if resp.status_code == 429:
- raise SavePageNowBackoffError("status_code: {}, url: {}".format(
- resp.status_code, request_url))
+ raise SavePageNowBackoffError(
+ "status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
elif resp.status_code != 200:
- raise SavePageNowError("SPN2 status_code: {}, url: {}".format(
- resp.status_code, request_url))
+ raise SavePageNowError(
+ "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
resp_json = resp.json()
- if resp_json and 'message' in resp_json and 'You have already reached the limit of active sessions' in resp_json[
- 'message']:
- raise SavePageNowBackoffError(resp_json['message'])
- elif not resp_json or 'job_id' not in resp_json or not resp_json['job_id']:
+ if (
+ resp_json
+ and "message" in resp_json
+ and "You have already reached the limit of active sessions" in resp_json["message"]
+ ):
+ raise SavePageNowBackoffError(resp_json["message"])
+ elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]:
raise SavePageNowError(
- "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json))
+ "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)
+ )
- job_id = resp_json['job_id']
+ job_id = resp_json["job_id"]
print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)
# poll until complete
@@ -981,53 +1055,59 @@ class SavePageNowClient:
resp.raise_for_status()
except Exception:
raise SavePageNowError(resp.content)
- status = resp.json()['status']
- if status == 'pending':
+ status = resp.json()["status"]
+ if status == "pending":
time.sleep(self.poll_seconds)
- elif status in ('success', 'error'):
+ elif status in ("success", "error"):
final_json = resp.json()
break
else:
- raise SavePageNowError("Unknown SPN2 status:{} url:{}".format(
- status, request_url))
+ raise SavePageNowError(
+ "Unknown SPN2 status:{} url:{}".format(status, request_url)
+ )
if not final_json:
raise SavePageNowError("SPN2 timed out (polling count exceeded)")
# if there was a recent crawl of same URL, fetch the status of that
# crawl to get correct datetime
- if final_json.get('original_job_id'):
- print(f" SPN recent capture: {job_id} -> {final_json['original_job_id']}",
- file=sys.stderr)
- resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint,
- final_json['original_job_id']))
+ if final_json.get("original_job_id"):
+ print(
+ f" SPN recent capture: {job_id} -> {final_json['original_job_id']}",
+ file=sys.stderr,
+ )
+ resp = self.v2_session.get(
+ "{}/status/{}".format(self.v2endpoint, final_json["original_job_id"])
+ )
try:
resp.raise_for_status()
except Exception:
raise SavePageNowError(resp.content)
final_json = resp.json()
- #print(final_json, file=sys.stderr)
+ # print(final_json, file=sys.stderr)
- if final_json['status'] == "success":
- if final_json.get('original_url').startswith('/'):
- print(f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}",
- file=sys.stderr)
+ if final_json["status"] == "success":
+ if final_json.get("original_url").startswith("/"):
+ print(
+ f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}",
+ file=sys.stderr,
+ )
return SavePageNowResult(
True,
"success",
job_id,
request_url,
- final_json['original_url'],
- final_json['timestamp'],
- final_json['resources'],
+ final_json["original_url"],
+ final_json["timestamp"],
+ final_json["resources"],
)
else:
- if final_json['status'] == 'pending':
- final_json['status'] = 'error:pending'
+ if final_json["status"] == "pending":
+ final_json["status"] = "error:pending"
return SavePageNowResult(
False,
- final_json.get('status_ext') or final_json['status'],
+ final_json.get("status_ext") or final_json["status"],
job_id,
request_url,
None,
@@ -1035,10 +1115,12 @@ class SavePageNowClient:
None,
)
- def crawl_resource(self,
- start_url: str,
- wayback_client: WaybackClient,
- force_simple_get: Optional[int] = None) -> ResourceResult:
+ def crawl_resource(
+ self,
+ start_url: str,
+ wayback_client: WaybackClient,
+ force_simple_get: Optional[int] = None,
+ ) -> ResourceResult:
"""
Runs a SPN2 crawl, then fetches body.
@@ -1048,18 +1130,23 @@ class SavePageNowClient:
"""
# HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
- if 'gzbd.cnki.net/' in start_url:
- spn_result = self.save_url_now_v2(start_url,
- force_simple_get=force_simple_get,
- capture_outlinks=1)
+ if "gzbd.cnki.net/" in start_url:
+ spn_result = self.save_url_now_v2(
+ start_url, force_simple_get=force_simple_get, capture_outlinks=1
+ )
else:
spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get)
if not spn_result.success:
status = spn_result.status
- if status in ("error:invalid-url", "error:not-found",
- "error:invalid-host-resolution", "error:gateway-timeout",
- "error:too-many-redirects", "error:read-timeout"):
+ if status in (
+ "error:invalid-url",
+ "error:not-found",
+ "error:invalid-host-resolution",
+ "error:gateway-timeout",
+ "error:too-many-redirects",
+ "error:read-timeout",
+ ):
status = status.replace("error:", "")
elif status in ("error:no-access", "error:forbidden"):
status = "forbidden"
@@ -1070,8 +1157,10 @@ class SavePageNowClient:
elif status.startswith("error:"):
status = "spn2-" + status
# despite other errors, call these a failure (so we don't retry)
- if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent')
- or spn_result.terminal_url.endswith("cookieSet=1")):
+ if spn_result.terminal_url and (
+ spn_result.terminal_url.endswith("/cookieAbsent")
+ or spn_result.terminal_url.endswith("cookieSet=1")
+ ):
status = "blocked-cookie"
return ResourceResult(
start_url=start_url,
@@ -1084,10 +1173,10 @@ class SavePageNowClient:
cdx=None,
revisit_cdx=None,
)
- #print(spn_result, file=sys.stderr)
+ # print(spn_result, file=sys.stderr)
# detect partial URL response (aka, success, but missing full URL)
- if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
+ if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith("/"):
return ResourceResult(
start_url=start_url,
hit=False,
@@ -1102,7 +1191,8 @@ class SavePageNowClient:
# don't try to CDX fetch for this common cookie block terminal
if spn_result.terminal_url.endswith(
- '/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"):
+ "/cookieAbsent"
+ ) or spn_result.terminal_url.endswith("cookieSet=1"):
return ResourceResult(
start_url=start_url,
hit=False,
@@ -1127,7 +1217,7 @@ class SavePageNowClient:
cdx_row = elsevier_pdf_cdx
else:
print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
- #print(elsevier_pdf_cdx, file=sys.stderr)
+ # print(elsevier_pdf_cdx, file=sys.stderr)
if not cdx_row:
# lookup exact
@@ -1164,11 +1254,11 @@ class SavePageNowClient:
revisit_cdx=None,
)
- #print(cdx_row, file=sys.stderr)
+ # print(cdx_row, file=sys.stderr)
revisit_cdx = None
final_cdx: Union[CdxRow, CdxPartial] = cdx_row
- if '/' in cdx_row.warc_path:
+ if "/" in cdx_row.warc_path:
# Usually can't do this kind of direct fetch because CDX result is recent/live
resource = wayback_client.fetch_petabox(
csize=cdx_row.warc_csize,
@@ -1228,12 +1318,19 @@ class SavePageNowClient:
)
-def fix_transfer_encoding(file_meta: dict,
- resource: ResourceResult) -> Tuple[dict, ResourceResult]:
- if resource.body and file_meta[
- 'mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
- print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype),
- file=sys.stderr)
+def fix_transfer_encoding(
+ file_meta: dict, resource: ResourceResult
+) -> Tuple[dict, ResourceResult]:
+ if (
+ resource.body
+ and file_meta["mimetype"] == "application/gzip"
+ and resource.cdx
+ and resource.cdx.mimetype != "application/gzip"
+ ):
+ print(
+ " transfer encoding not stripped: {}".format(resource.cdx.mimetype),
+ file=sys.stderr,
+ )
inner_body = gzip.decompress(resource.body)
if not inner_body:
raise Exception("null body inside transfer encoding")