about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/ia.py 36
1 file changed, 25 insertions, 11 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 166cf20..68b3466 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1084,17 +1084,31 @@ class SavePageNowClient:
# warc_path etc will change, so strip them out
cdx_row = cdx_partial_from_row(cdx_row)
- return ResourceResult(
- start_url=start_url,
- hit=True,
- status="success",
- terminal_url=cdx_row.url,
- terminal_dt=cdx_row.datetime,
- terminal_status_code=cdx_row.status_code,
- body=body,
- cdx=cdx_row,
- revisit_cdx=revisit_cdx,
- )
+ assert cdx_row.status_code
+ if cdx_row.status_code in (200, 226):
+ return ResourceResult(
+ start_url=start_url,
+ hit=True,
+ status="success",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=cdx_row,
+ revisit_cdx=revisit_cdx,
+ )
+ else:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="terminal-bad-status",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=cdx_row,
+ revisit_cdx=revisit_cdx,
+ )
def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]: