From b9ad3a9468086940aaba15ce105c092f19e57092 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 13 Nov 2019 20:21:08 -0800 Subject: allow way more errors in SPN path --- python/sandcrawler/ingest.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 8f7220d..2566973 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -5,7 +5,7 @@ import base64 import requests from http.server import BaseHTTPRequestHandler, HTTPServer -from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError +from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError, CdxApiError from sandcrawler.grobid import GrobidClient from sandcrawler.misc import gen_file_metadata from sandcrawler.html import extract_fulltext_url @@ -101,8 +101,17 @@ class IngestFileWorker(SandcrawlerWorker): while url: try: (cdx_dict, body) = self.get_cdx_and_body(url) - except SavePageNowRemoteError: + except SavePageNowRemoteError as e: response['status'] = 'spn-remote-error' + response['error_message'] = str(e) + return response + except SavePageNowError as e: + response['status'] = 'spn-error' + response['error_message'] = str(e) + return response + except CdxApiError as e: + response['status'] = 'cdx-error' + response['error_message'] = str(e) return response sys.stderr.write("CDX hit: {}\n".format(cdx_dict)) -- cgit v1.2.3