From 200bf734bd459dd3c7a147b3dfe127dbf0ed7f70 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 21 Oct 2020 12:20:52 -0700 Subject: differentiate wayback-error from wayback-content-error The motivation here is to distinguish errors due to current content in wayback (e.g., in WARCs) from operational errors (e.g., wayback machine is down, or network failures/disruption). --- python/sandcrawler/ingest.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'python/sandcrawler/ingest.py') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 322859a..57988e8 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -8,7 +8,7 @@ import requests from http.server import BaseHTTPRequestHandler, HTTPServer from collections import namedtuple -from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult +from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult from sandcrawler.grobid import GrobidClient from sandcrawler.pdfextract import process_pdf, PdfExtractResult from sandcrawler.misc import gen_file_metadata, clean_url @@ -388,6 +388,10 @@ class IngestFileWorker(SandcrawlerWorker): result['status'] = 'wayback-error' result['error_message'] = str(e)[:1600] return result + except WaybackContentError as e: + result['status'] = 'wayback-content-error' + result['error_message'] = str(e)[:1600] + return result except NotImplementedError as e: result['status'] = 'not-implemented' result['error_message'] = str(e)[:1600] -- cgit v1.2.3