about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-06 18:18:39 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-06 18:18:39 -0800
commit b86a6fd5bb74f9f11e682b9a98f02b5dba8c4cc1 (patch)
tree 13a1cfe475a0af13571e9990e0947e367a12ae4e
parent 8958b12ff12c59f1c1f7267a509a99bfaa14c7d7 (diff)
download sandcrawler-b86a6fd5bb74f9f11e682b9a98f02b5dba8c4cc1.tar.gz
sandcrawler-b86a6fd5bb74f9f11e682b9a98f02b5dba8c4cc1.zip
html: catch and report exceptions at process_hit() stage
-rw-r--r-- python/sandcrawler/ingest.py 31
1 files changed, 27 insertions, 4 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index f696231..0c8eee6 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -366,10 +366,33 @@ class IngestFileWorker(SandcrawlerWorker):
when = parse_cdx_datetime(resource.cdx.datetime)
full_resources: List[WebResource] = []
- if self.html_quick_mode:
- full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
- else:
- full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+
+ partial_result = dict(
+ html_biblio=html_biblio_dict,
+ html_scope=html_scope,
+ )
+
+ try:
+ if self.html_quick_mode:
+ full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
+ else:
+ full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+ except PetaboxError as e:
+ partial_result['status'] = 'petabox-error'
+ partial_result['error_message'] = str(e)[:1600]
+ return partial_result
+ except CdxApiError as e:
+ partial_result['status'] = 'cdx-error'
+ partial_result['error_message'] = str(e)[:1600]
+ return partial_result
+ except WaybackError as e:
+ partial_result['status'] = 'wayback-error'
+ partial_result['error_message'] = str(e)[:1600]
+ return partial_result
+ except WaybackContentError as e:
+ partial_result['status'] = 'wayback-content-error'
+ partial_result['error_message'] = str(e)[:1600]
+ return partial_result
if self.htmlteixml_sink and html_body['status'] == "success":
self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex'])