diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-13 12:35:10 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-13 12:35:10 -0700 |
commit | 88eb9fa2442f24e9846413cff17461e29264ffd7 (patch) | |
tree | 0ac9c8d37574ab2386e7b46f121d33d4556d43eb | |
parent | 5cc02ee97b39d72c88dd2e0755abf6f0d3a4b050 (diff) | |
download | sandcrawler-88eb9fa2442f24e9846413cff17461e29264ffd7.tar.gz sandcrawler-88eb9fa2442f24e9846413cff17461e29264ffd7.zip |
crawl: SPN2 non-200 success code path
-rw-r--r-- | python/sandcrawler/ia.py | 36 |
1 file changed, 25 insertions, 11 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 166cf20..68b3466 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1084,17 +1084,31 @@ class SavePageNowClient:
         # warc_path etc will change, so strip them out
         cdx_row = cdx_partial_from_row(cdx_row)
 
-        return ResourceResult(
-            start_url=start_url,
-            hit=True,
-            status="success",
-            terminal_url=cdx_row.url,
-            terminal_dt=cdx_row.datetime,
-            terminal_status_code=cdx_row.status_code,
-            body=body,
-            cdx=cdx_row,
-            revisit_cdx=revisit_cdx,
-        )
+        assert cdx_row.status_code
+        if cdx_row.status_code in (200, 226):
+            return ResourceResult(
+                start_url=start_url,
+                hit=True,
+                status="success",
+                terminal_url=cdx_row.url,
+                terminal_dt=cdx_row.datetime,
+                terminal_status_code=cdx_row.status_code,
+                body=body,
+                cdx=cdx_row,
+                revisit_cdx=revisit_cdx,
+            )
+        else:
+            return ResourceResult(
+                start_url=start_url,
+                hit=False,
+                status="terminal-bad-status",
+                terminal_url=cdx_row.url,
+                terminal_dt=cdx_row.datetime,
+                terminal_status_code=cdx_row.status_code,
+                body=body,
+                cdx=cdx_row,
+                revisit_cdx=revisit_cdx,
+            )
 
 
 def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
```