From 9ca0c0bc13082aa061eee3da6b057fbacb4052e9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 24 Feb 2020 10:46:49 -0800 Subject: fetch_petabox_body: allow non-200 status code fetches But only if it matches what the revisit record indicated. This is mostly to enable better revisit fetching. --- python/sandcrawler/ia.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index cf99e83..c3ca80f 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -387,6 +387,7 @@ class WaybackClient: offset=revisit_cdx.warc_offset, warc_path=revisit_cdx.warc_path, resolve_revisit=False, + expected_status_code=revisit_cdx.status_code, ) elif status_code in (200, 226): try: @@ -404,7 +405,7 @@ class WaybackClient: revisit_cdx=revisit_cdx, ) - def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True): + def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True, expected_status_code=None): """ Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize. @@ -419,7 +420,14 @@ class WaybackClient: resolve_revisit=resolve_revisit, ) - if resource.status_code not in (200, 226): + if expected_status_code: + if expected_status_code != resource.status_code: + raise KeyError("archived HTTP response (WARC) was not {}: {}".format( + expected_status_code, + resource.status_code, + ) + ) + elif resource.status_code not in (200, 226): raise KeyError("archived HTTP response (WARC) was not 200: {}".format( resource.status_code) ) -- cgit v1.2.3