diff options
-rw-r--r-- | python/sandcrawler/ingest.py | 13 |
1 files changed, 11 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 8f7220d..2566973 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -5,7 +5,7 @@ import base64 import requests from http.server import BaseHTTPRequestHandler, HTTPServer -from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError +from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError, CdxApiError from sandcrawler.grobid import GrobidClient from sandcrawler.misc import gen_file_metadata from sandcrawler.html import extract_fulltext_url @@ -101,8 +101,17 @@ class IngestFileWorker(SandcrawlerWorker): while url: try: (cdx_dict, body) = self.get_cdx_and_body(url) - except SavePageNowRemoteError: + except SavePageNowRemoteError as e: response['status'] = 'spn-remote-error' + response['error_message'] = str(e) + return response + except SavePageNowError as e: + response['status'] = 'spn-error' + response['error_message'] = str(e) + return response + except CdxApiError as e: + response['status'] = 'cdx-error' + response['error_message'] = str(e) return response sys.stderr.write("CDX hit: {}\n".format(cdx_dict)) |