aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/workers.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-10-21 12:20:52 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-10-21 12:20:54 -0700
commit: 200bf734bd459dd3c7a147b3dfe127dbf0ed7f70 (patch)
tree: 4f010e66a059271ac3b9c496d15a3bc90bd763c4 /python/sandcrawler/workers.py
parent: 33249f2679851afb64142c428be45d16f35f5539 (diff)
download: sandcrawler-200bf734bd459dd3c7a147b3dfe127dbf0ed7f70.tar.gz
sandcrawler-200bf734bd459dd3c7a147b3dfe127dbf0ed7f70.zip
differentiate wayback-error from wayback-content-error
The motivation here is to distinguish errors due to current content in wayback (e.g., in WARCs) from operational errors (e.g., the wayback machine is down, or network failures/disruptions).
Diffstat (limited to 'python/sandcrawler/workers.py')
-rw-r--r--python/sandcrawler/workers.py6
1 files changed, 3 insertions, 3 deletions
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 814cbf3..37e3d7a 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -10,7 +10,7 @@ from collections import Counter
from confluent_kafka import Consumer, Producer, KafkaException
from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, PetaboxError
+from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
class SandcrawlerWorker(object):
@@ -135,7 +135,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
warc_path=record['warc_path'],
)
wayback_sec = time.time() - start
- except (WaybackError, PetaboxError, KeyError) as we:
+ except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we:
return dict(
key=default_key,
source=record,
@@ -153,7 +153,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
datetime=record['datetime'],
)
wayback_sec = time.time() - start
- except WaybackError as we:
+ except (WaybackError, WaybackContentError) as we:
return dict(
key=default_key,
source=record,