about summary refs log tree commit diff stats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- python/sandcrawler/ingest.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 2f9c523..aedf2ff 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -10,7 +10,7 @@ from http.server import BaseHTTPRequestHandler, HTTPServer
from collections import namedtuple
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
from sandcrawler.grobid import GrobidClient
from sandcrawler.pdfextract import process_pdf, PdfExtractResult
from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
@@ -393,6 +393,10 @@ class IngestFileWorker(SandcrawlerWorker):
partial_result['status'] = 'wayback-content-error'
partial_result['error_message'] = str(e)[:1600]
return partial_result
+ except NoCaptureError as e:
+ partial_result['status'] = 'html-resource-no-capture'
+ partial_result['error_message'] = str(e)[:1600]
+ return partial_result
if self.htmlteixml_sink and html_body['status'] == "success":
self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex'])