diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-28 12:52:24 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-28 12:52:24 -0800 |
commit | 084807ee51f6b5844b323a1217a70b2f12ee966d (patch) | |
tree | 713c777aeb0b5fffb5f53fd97f1ee01c8fd44a06 /python | |
parent | 2e93f94c9ebba689dde252ca8f5b106765cece88 (diff) | |
download | sandcrawler-084807ee51f6b5844b323a1217a70b2f12ee966d.tar.gz sandcrawler-084807ee51f6b5844b323a1217a70b2f12ee966d.zip |
clarify petabox fetch behavior
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 9 |
1 file changed, 6 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index b5883e5..9bed59d 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -271,8 +271,8 @@ class WaybackClient: If there is a problem with petabox, raises a PetaboxError. If resource doesn't exist, would raise a KeyError (TODO). - The full record is returned as crawled; it may be a redirect, 404 - response, etc. + The body is only returned if the record is success (HTTP 200 or + equivalent). Otherwise only the status and header info is returned. WarcResource object (namedtuple) contains fields: - status_code: int @@ -325,7 +325,7 @@ class WaybackClient: raise WaybackError( "found revisit record, but won't resolve (loop?)") revisit_uri, revisit_dt = gwb_record.refers_to # convert revisit_dt - assert len(revisit_dt) == len("2018-07-24T11:56:49") + assert len(revisit_dt) == 19 # len("2018-07-24T11:56:49") revisit_uri = revisit_uri.decode('utf-8') revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '') revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt) @@ -341,6 +341,9 @@ class WaybackClient: except IncompleteRead as ire: raise WaybackError( "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + else if status_code is None: + raise WaybackError( + "got a None status_code in (W)ARC record".format(ire)) return WarcResource( status_code=status_code, location=location, |