aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-19 16:25:30 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-19 16:25:30 -0700
commit e9e651635e157a414163fbd9902db7e3908774ff (patch)
tree 62735c2894b2b495207e20ae3ee5fc4087b1eda2
parent d916a3c6b99e74845aff8d0edc9709fd86f8d9b6 (diff)
download sandcrawler-e9e651635e157a414163fbd9902db7e3908774ff.tar.gz
sandcrawler-e9e651635e157a414163fbd9902db7e3908774ff.zip
ingest: catch wayback-fail-after-SPN as separate status
-rw-r--r-- python/sandcrawler/ia.py 21
1 files changed, 17 insertions, 4 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 0bc3a97..28eda7c 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1017,10 +1017,23 @@ class SavePageNowClient:
revisit_cdx = resource.revisit_cdx
else:
# note: currently not trying to verify cdx_row.sha1hex
- body = wayback_client.fetch_replay_body(
- url=cdx_row.url,
- datetime=cdx_row.datetime,
- )
+ try:
+ body = wayback_client.fetch_replay_body(
+ url=cdx_row.url,
+ datetime=cdx_row.datetime,
+ )
+ except WaybackError as we:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="spn2-wayback-error",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
# warc_path etc will change, so strip them out
cdx_row = cdx_partial_from_row(cdx_row)