about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py | 13
1 files changed, 11 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 8f7220d..2566973 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -5,7 +5,7 @@ import base64
import requests
from http.server import BaseHTTPRequestHandler, HTTPServer
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError, CdxApiError
from sandcrawler.grobid import GrobidClient
from sandcrawler.misc import gen_file_metadata
from sandcrawler.html import extract_fulltext_url
@@ -101,8 +101,17 @@ class IngestFileWorker(SandcrawlerWorker):
while url:
try:
(cdx_dict, body) = self.get_cdx_and_body(url)
- except SavePageNowRemoteError:
+ except SavePageNowRemoteError as e:
response['status'] = 'spn-remote-error'
+ response['error_message'] = str(e)
+ return response
+ except SavePageNowError as e:
+ response['status'] = 'spn-error'
+ response['error_message'] = str(e)
+ return response
+ except CdxApiError as e:
+ response['status'] = 'cdx-error'
+ response['error_message'] = str(e)
return response
sys.stderr.write("CDX hit: {}\n".format(cdx_dict))