From 2d338ab3649642affbedeb28470a96a6a5ba7597 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 20 Nov 2020 11:04:31 -0800 Subject: handle more wayback error conditions --- python/sandcrawler/ia.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 4da07af..cb4fdf3 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -11,6 +11,7 @@ import json import requests import datetime import urllib.parse +import urllib3.exceptions from typing import Tuple from collections import namedtuple @@ -373,6 +374,11 @@ class WaybackClient: except wayback.exception.ResourceUnavailable: print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)") + except wayback.exception.InvalidResource: + print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) + raise WaybackContentError("failed to load file contents from wayback/petabox (InvalidResource)") + except urllib3.exceptions.ReadTimeoutError as rte: + raise PetaboxError("failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(rte)) except ValueError as ve: raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) except EOFError as eofe: -- cgit v1.2.3