From 200bf734bd459dd3c7a147b3dfe127dbf0ed7f70 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 21 Oct 2020 12:20:52 -0700 Subject: differentiate wayback-error from wayback-content-error The motivation here is to distinguish errors due to current content in wayback (e.g., in WARCs) from operational errors (e.g., wayback machine is down, or network failures/disruption). --- python/sandcrawler/workers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'python/sandcrawler/workers.py') diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index 814cbf3..37e3d7a 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -10,7 +10,7 @@ from collections import Counter from confluent_kafka import Consumer, Producer, KafkaException from .misc import parse_cdx_line -from .ia import SandcrawlerBackoffError, WaybackError, PetaboxError +from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError class SandcrawlerWorker(object): @@ -135,7 +135,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): warc_path=record['warc_path'], ) wayback_sec = time.time() - start - except (WaybackError, PetaboxError, KeyError) as we: + except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we: return dict( key=default_key, source=record, @@ -153,7 +153,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): datetime=record['datetime'], ) wayback_sec = time.time() - start - except WaybackError as we: + except (WaybackError, WaybackContentError) as we: return dict( key=default_key, source=record, -- cgit v1.2.3