diff options
-rw-r--r-- | python/sandcrawler/ia.py | 36 |
1 file changed, 25 insertions, 11 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 166cf20..68b3466 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -1084,17 +1084,31 @@ class SavePageNowClient: # warc_path etc will change, so strip them out cdx_row = cdx_partial_from_row(cdx_row) - return ResourceResult( - start_url=start_url, - hit=True, - status="success", - terminal_url=cdx_row.url, - terminal_dt=cdx_row.datetime, - terminal_status_code=cdx_row.status_code, - body=body, - cdx=cdx_row, - revisit_cdx=revisit_cdx, - ) + assert cdx_row.status_code + if cdx_row.status_code in (200, 226): + return ResourceResult( + start_url=start_url, + hit=True, + status="success", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=body, + cdx=cdx_row, + revisit_cdx=revisit_cdx, + ) + else: + return ResourceResult( + start_url=start_url, + hit=False, + status="terminal-bad-status", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=body, + cdx=cdx_row, + revisit_cdx=revisit_cdx, + ) def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]: |