aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ia.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-07-13 12:35:10 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-07-13 12:35:10 -0700
commit 88eb9fa2442f24e9846413cff17461e29264ffd7 (patch)
tree 0ac9c8d37574ab2386e7b46f121d33d4556d43eb /python/sandcrawler/ia.py
parent 5cc02ee97b39d72c88dd2e0755abf6f0d3a4b050 (diff)
download sandcrawler-88eb9fa2442f24e9846413cff17461e29264ffd7.tar.gz
sandcrawler-88eb9fa2442f24e9846413cff17461e29264ffd7.zip
crawl: SPN2 non-200 success code path
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- python/sandcrawler/ia.py 36
1 files changed, 25 insertions, 11 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 166cf20..68b3466 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1084,17 +1084,31 @@ class SavePageNowClient:
# warc_path etc will change, so strip them out
cdx_row = cdx_partial_from_row(cdx_row)
- return ResourceResult(
- start_url=start_url,
- hit=True,
- status="success",
- terminal_url=cdx_row.url,
- terminal_dt=cdx_row.datetime,
- terminal_status_code=cdx_row.status_code,
- body=body,
- cdx=cdx_row,
- revisit_cdx=revisit_cdx,
- )
+ assert cdx_row.status_code
+ if cdx_row.status_code in (200, 226):
+ return ResourceResult(
+ start_url=start_url,
+ hit=True,
+ status="success",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=cdx_row,
+ revisit_cdx=revisit_cdx,
+ )
+ else:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="terminal-bad-status",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=cdx_row,
+ revisit_cdx=revisit_cdx,
+ )
def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]: