about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-24 10:46:49 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-24 10:46:51 -0800
commit 9ca0c0bc13082aa061eee3da6b057fbacb4052e9 (patch)
tree 8e83eef83c8bf494da6fb5ab7f01fb38fc1d05aa
parent 15fbaa45003937db9414be729fda9615b960dbe1 (diff)
download sandcrawler-9ca0c0bc13082aa061eee3da6b057fbacb4052e9.tar.gz
download sandcrawler-9ca0c0bc13082aa061eee3da6b057fbacb4052e9.zip
fetch_petabox_body: allow non-200 status code fetches
But only if it matches what the revisit record indicated. This is mostly to enable better revisit fetching.
-rw-r--r-- python/sandcrawler/ia.py 12
1 files changed, 10 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index cf99e83..c3ca80f 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -387,6 +387,7 @@ class WaybackClient:
offset=revisit_cdx.warc_offset,
warc_path=revisit_cdx.warc_path,
resolve_revisit=False,
+ expected_status_code=revisit_cdx.status_code,
)
elif status_code in (200, 226):
try:
@@ -404,7 +405,7 @@ class WaybackClient:
revisit_cdx=revisit_cdx,
)
- def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True):
+ def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True, expected_status_code=None):
"""
Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
@@ -419,7 +420,14 @@ class WaybackClient:
resolve_revisit=resolve_revisit,
)
- if resource.status_code not in (200, 226):
+ if expected_status_code:
+ if expected_status_code != resource.status_code:
+ raise KeyError("archived HTTP response (WARC) was not {}: {}".format(
+ expected_status_code,
+ resource.status_code,
+ )
+ )
+ elif resource.status_code not in (200, 226):
raise KeyError("archived HTTP response (WARC) was not 200: {}".format(
resource.status_code)
)