diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 20:21:08 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 20:21:08 -0800 |
commit | b9ad3a9468086940aaba15ce105c092f19e57092 (patch) | |
tree | ae0af4ab16d716f8c91a135c0ddc3d8710d1203d | |
parent | cab64d549fec4a6119b9c623748624ac70c8454b (diff) | |
download | sandcrawler-b9ad3a9468086940aaba15ce105c092f19e57092.tar.gz sandcrawler-b9ad3a9468086940aaba15ce105c092f19e57092.zip |
allow way more errors in SPN path
-rw-r--r-- | python/sandcrawler/ingest.py | 13 |
1 files changed, 11 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 8f7220d..2566973 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -5,7 +5,7 @@ import base64
 import requests
 from http.server import BaseHTTPRequestHandler, HTTPServer
 
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError, CdxApiError
 from sandcrawler.grobid import GrobidClient
 from sandcrawler.misc import gen_file_metadata
 from sandcrawler.html import extract_fulltext_url
@@ -101,8 +101,17 @@ class IngestFileWorker(SandcrawlerWorker):
         while url:
             try:
                 (cdx_dict, body) = self.get_cdx_and_body(url)
-            except SavePageNowRemoteError:
+            except SavePageNowRemoteError as e:
                 response['status'] = 'spn-remote-error'
+                response['error_message'] = str(e)
+                return response
+            except SavePageNowError as e:
+                response['status'] = 'spn-error'
+                response['error_message'] = str(e)
+                return response
+            except CdxApiError as e:
+                response['status'] = 'cdx-error'
+                response['error_message'] = str(e)
                 return response
             sys.stderr.write("CDX hit: {}\n".format(cdx_dict))
 
```