about summary refs log tree commit diff stats
path: root/python/sandcrawler/ia.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- python/sandcrawler/ia.py 42
1 files changed, 23 insertions, 19 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index ba9b5b9..ac0fef8 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -13,7 +13,7 @@ from http.client import IncompleteRead
from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-from .misc import b32_hex, requests_retry_session
+from .misc import b32_hex, requests_retry_session, gen_file_metadata
ResourceResult = namedtuple("ResourceResult", [
@@ -106,15 +106,15 @@ class CdxApiClient:
status_code=int(raw[4]),
sha1b32=raw[5],
sha1hex=b32_hex(raw[5]),
- warc_csize=raw[8],
- warc_offset=raw[9],
+ warc_csize=int(raw[8]),
+ warc_offset=int(raw[9]),
warc_path=raw[10],
)
assert (row.mimetype == "-") or ("-" not in row)
rows.append(row)
return rows
- def fetch(self, url, datetime):
+ def fetch(self, url, datetime, filter_status_code=None):
"""
Fetches a single CDX row by url/datetime. Raises a KeyError if not
found, because we expect to be looking up a specific full record.
@@ -127,9 +127,10 @@ class CdxApiClient:
'to': datetime,
'matchType': 'exact',
'limit': -1,
- 'fastLatest': True,
'output': 'json',
}
+ if filter_status_code:
+ params['filter'] = "statuscode:{}".format(filter_status_code)
resp = self._query_api(params)
if not resp:
raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
@@ -148,9 +149,9 @@ class CdxApiClient:
'url': url,
'matchType': 'exact',
'limit': -25,
- 'fastLatest': True,
'output': 'json',
'collapse': 'timestamp:6',
+ 'filter': '!mimetype:warc/revisit',
}
if max_age_days:
since = datetime.date.today() - datetime.timedelta(days=max_age_days)
@@ -165,9 +166,11 @@ class CdxApiClient:
200
mimetype match
- most-recent
+ not-liveweb
+ most-recent
no match
- most-recent
+ not-liveweb
+ most-recent
3xx
most-recent
4xx
@@ -178,10 +181,11 @@ class CdxApiClient:
This function will create a tuple that can be used to sort in *reverse* order.
"""
return (
- r.status_code == 200,
- 0 - r.status_code,
- r.mimetype == best_mimetype,
- r.datetime,
+ int(r.status_code == 200),
+ int(0 - r.status_code),
+ int(r.mimetype == best_mimetype),
+ int('/' in r.warc_path),
+ int(r.datetime),
)
rows = sorted(rows, key=cdx_sort_key)
@@ -251,7 +255,7 @@ class WaybackClient:
# whole cluster is down though.
status_code = gwb_record.get_status()[0]
- location = gwb_record.get_location()[0]
+ location = (gwb_record.get_location() or [None])[0]
body = None
if status_code == 200:
@@ -280,7 +284,7 @@ class WaybackClient:
return resource.body
- def fetch_replay(self, url, datetime):
+ def fetch_replay_body(self, url, datetime):
"""
Fetches an HTTP 200 record from wayback via the replay interface
(web.archive.org) instead of petabox.
@@ -327,8 +331,8 @@ class WaybackClient:
body=None,
cdx=None,
)
- if cdx.status_code == 200:
- body = self.fetch_petabox_body(cdx.warc_csize, cdx.warc_offset, cdx_row.warc_path)
+ if cdx_row.status_code == 200:
+ body = self.fetch_petabox_body(cdx_row.warc_csize, cdx_row.warc_offset, cdx_row.warc_path)
return ResourceResult(
start_url=start_url,
hit=True,
@@ -360,7 +364,7 @@ class WaybackClient:
return ResourceResult(
start_url=start_url,
hit=False,
- status="terminal-not-success",
+ status="terminal-bad-status",
terminal_url=cdx_row.url,
terminal_dt=cdx_row.datetime,
terminal_status_code=cdx_row.status_code,
@@ -506,7 +510,7 @@ class SavePageNowClient:
status=spn_result.status,
terminal_url=spn_result.terminal_url,
terminal_dt=spn_result.terminal_dt,
- terminal_status_code=spn_result.terminal_status_code,
+ terminal_status_code=None,
body=None,
cdx=None,
)
@@ -523,7 +527,7 @@ class SavePageNowClient:
hit=True,
status="success",
terminal_url=cdx_row.url,
- terminal_dt=cdx_row.status_code,
+ terminal_dt=cdx_row.datetime,
terminal_status_code=cdx_row.status_code,
body=body,
cdx=cdx_partial,