diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 16:25:30 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 16:25:30 -0700 |
commit | e9e651635e157a414163fbd9902db7e3908774ff (patch) | |
tree | 62735c2894b2b495207e20ae3ee5fc4087b1eda2 /python | |
parent | d916a3c6b99e74845aff8d0edc9709fd86f8d9b6 (diff) | |
download | sandcrawler-e9e651635e157a414163fbd9902db7e3908774ff.tar.gz sandcrawler-e9e651635e157a414163fbd9902db7e3908774ff.zip |
ingest: catch wayback-fail-after-SPN as separate status
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 21 |
1 file changed, 17 insertions, 4 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 0bc3a97..28eda7c 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1017,10 +1017,23 @@ class SavePageNowClient:
             revisit_cdx = resource.revisit_cdx
         else:
             # note: currently not trying to verify cdx_row.sha1hex
-            body = wayback_client.fetch_replay_body(
-                url=cdx_row.url,
-                datetime=cdx_row.datetime,
-            )
+            try:
+                body = wayback_client.fetch_replay_body(
+                    url=cdx_row.url,
+                    datetime=cdx_row.datetime,
+                )
+            except WaybackError as we:
+                return ResourceResult(
+                    start_url=start_url,
+                    hit=False,
+                    status="spn2-wayback-error",
+                    terminal_url=cdx_row.url,
+                    terminal_dt=cdx_row.datetime,
+                    terminal_status_code=None,
+                    body=None,
+                    cdx=None,
+                    revisit_cdx=None,
+                )
 
         # warc_path etc will change, so strip them out
         cdx_row = cdx_partial_from_row(cdx_row)