diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-06 18:18:39 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-06 18:18:39 -0800 |
commit | b86a6fd5bb74f9f11e682b9a98f02b5dba8c4cc1 (patch) | |
tree | 13a1cfe475a0af13571e9990e0947e367a12ae4e | |
parent | 8958b12ff12c59f1c1f7267a509a99bfaa14c7d7 (diff) | |
download | sandcrawler-b86a6fd5bb74f9f11e682b9a98f02b5dba8c4cc1.tar.gz sandcrawler-b86a6fd5bb74f9f11e682b9a98f02b5dba8c4cc1.zip |
html: catch and report exceptions at process_hit() stage
-rw-r--r-- | python/sandcrawler/ingest.py | 31 |
1 file changed, 27 insertions, 4 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index f696231..0c8eee6 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -366,10 +366,33 @@ class IngestFileWorker(SandcrawlerWorker):
         when = parse_cdx_datetime(resource.cdx.datetime)
 
         full_resources: List[WebResource] = []
-        if self.html_quick_mode:
-            full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
-        else:
-            full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+
+        partial_result = dict(
+            html_biblio=html_biblio_dict,
+            html_scope=html_scope,
+        )
+
+        try:
+            if self.html_quick_mode:
+                full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
+            else:
+                full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+        except PetaboxError as e:
+            partial_result['status'] = 'petabox-error'
+            partial_result['error_message'] = str(e)[:1600]
+            return partial_result
+        except CdxApiError as e:
+            partial_result['status'] = 'cdx-error'
+            partial_result['error_message'] = str(e)[:1600]
+            return partial_result
+        except WaybackError as e:
+            partial_result['status'] = 'wayback-error'
+            partial_result['error_message'] = str(e)[:1600]
+            return partial_result
+        except WaybackContentError as e:
+            partial_result['status'] = 'wayback-content-error'
+            partial_result['error_message'] = str(e)[:1600]
+            return partial_result
 
         if self.htmlteixml_sink and html_body['status'] == "success":
             self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex'])
```