From e9e651635e157a414163fbd9902db7e3908774ff Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 19 Oct 2020 16:25:30 -0700
Subject: ingest: catch wayback-fail-after-SPN as separate status

---
 python/sandcrawler/ia.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'python/sandcrawler/ia.py')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 0bc3a97..28eda7c 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1017,10 +1017,23 @@ class SavePageNowClient:
                 revisit_cdx = resource.revisit_cdx
         else:
             # note: currently not trying to verify cdx_row.sha1hex
-            body = wayback_client.fetch_replay_body(
-                url=cdx_row.url,
-                datetime=cdx_row.datetime,
-            )
+            try:
+                body = wayback_client.fetch_replay_body(
+                    url=cdx_row.url,
+                    datetime=cdx_row.datetime,
+                )
+            except WaybackError as we:
+                return ResourceResult(
+                    start_url=start_url,
+                    hit=False,
+                    status="spn2-wayback-error",
+                    terminal_url=cdx_row.url,
+                    terminal_dt=cdx_row.datetime,
+                    terminal_status_code=None,
+                    body=None,
+                    cdx=None,
+                    revisit_cdx=None,
+                )
             # warc_path etc will change, so strip them out
             cdx_row = cdx_partial_from_row(cdx_row)
--
cgit v1.2.3