aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-11-13 20:21:08 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-11-13 20:21:08 -0800
commit b9ad3a9468086940aaba15ce105c092f19e57092 (patch)
tree ae0af4ab16d716f8c91a135c0ddc3d8710d1203d /python
parent cab64d549fec4a6119b9c623748624ac70c8454b (diff)
download sandcrawler-b9ad3a9468086940aaba15ce105c092f19e57092.tar.gz
sandcrawler-b9ad3a9468086940aaba15ce105c092f19e57092.zip
allow way more errors in SPN path
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 13
1 files changed, 11 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 8f7220d..2566973 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -5,7 +5,7 @@ import base64
import requests
from http.server import BaseHTTPRequestHandler, HTTPServer
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError, CdxApiError
from sandcrawler.grobid import GrobidClient
from sandcrawler.misc import gen_file_metadata
from sandcrawler.html import extract_fulltext_url
@@ -101,8 +101,17 @@ class IngestFileWorker(SandcrawlerWorker):
while url:
try:
(cdx_dict, body) = self.get_cdx_and_body(url)
- except SavePageNowRemoteError:
+ except SavePageNowRemoteError as e:
response['status'] = 'spn-remote-error'
+ response['error_message'] = str(e)
+ return response
+ except SavePageNowError as e:
+ response['status'] = 'spn-error'
+ response['error_message'] = str(e)
+ return response
+ except CdxApiError as e:
+ response['status'] = 'cdx-error'
+ response['error_message'] = str(e)
return response
sys.stderr.write("CDX hit: {}\n".format(cdx_dict))