From 88eb9fa2442f24e9846413cff17461e29264ffd7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 13 Jul 2021 12:35:10 -0700 Subject: crawl: SPN2 non-200 success code path --- python/sandcrawler/ia.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'python/sandcrawler/ia.py') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 166cf20..68b3466 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -1084,17 +1084,31 @@ class SavePageNowClient: # warc_path etc will change, so strip them out cdx_row = cdx_partial_from_row(cdx_row) - return ResourceResult( - start_url=start_url, - hit=True, - status="success", - terminal_url=cdx_row.url, - terminal_dt=cdx_row.datetime, - terminal_status_code=cdx_row.status_code, - body=body, - cdx=cdx_row, - revisit_cdx=revisit_cdx, - ) + assert cdx_row.status_code + if cdx_row.status_code in (200, 226): + return ResourceResult( + start_url=start_url, + hit=True, + status="success", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=body, + cdx=cdx_row, + revisit_cdx=revisit_cdx, + ) + else: + return ResourceResult( + start_url=start_url, + hit=False, + status="terminal-bad-status", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=body, + cdx=cdx_row, + revisit_cdx=revisit_cdx, + ) def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]: -- cgit v1.2.3