diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 10:46:49 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 10:46:51 -0800 |
commit | 9ca0c0bc13082aa061eee3da6b057fbacb4052e9 (patch) | |
tree | 8e83eef83c8bf494da6fb5ab7f01fb38fc1d05aa /python | |
parent | 15fbaa45003937db9414be729fda9615b960dbe1 (diff) | |
download | sandcrawler-9ca0c0bc13082aa061eee3da6b057fbacb4052e9.tar.gz sandcrawler-9ca0c0bc13082aa061eee3da6b057fbacb4052e9.zip |
fetch_petabox_body: allow non-200 status code fetches
Non-200 responses are accepted only if the status code matches what the revisit record indicated.
This is mostly to enable better revisit fetching.
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 12 |
1 file changed, 10 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index cf99e83..c3ca80f 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -387,6 +387,7 @@ class WaybackClient: offset=revisit_cdx.warc_offset, warc_path=revisit_cdx.warc_path, resolve_revisit=False, + expected_status_code=revisit_cdx.status_code, ) elif status_code in (200, 226): try: @@ -404,7 +405,7 @@ class WaybackClient: revisit_cdx=revisit_cdx, ) - def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True): + def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True, expected_status_code=None): """ Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize. @@ -419,7 +420,14 @@ class WaybackClient: resolve_revisit=resolve_revisit, ) - if resource.status_code not in (200, 226): + if expected_status_code: + if expected_status_code != resource.status_code: + raise KeyError("archived HTTP response (WARC) was not {}: {}".format( + expected_status_code, + resource.status_code, + ) + ) + elif resource.status_code not in (200, 226): raise KeyError("archived HTTP response (WARC) was not 200: {}".format( resource.status_code) ) |