From abe36a83d189e13f3fe20519ccc4d90114e71455 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 8 Nov 2020 18:02:05 -0800 Subject: ingest html: return better status based on sniffed scope --- python/sandcrawler/ingest.py | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index d95b8bf..fb442d9 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -343,19 +343,40 @@ class IngestFileWorker(SandcrawlerWorker): def process_html(self, resource: ResourceResult, file_meta: dict) -> dict: - html_doc = HTMLParser(resource.body) + assert resource.body + try: + html_doc = HTMLParser(resource.body) + except ValueError as ve: + return dict( + status="html-selectolax-error", + ) html_biblio = html_extract_biblio(resource.terminal_url, html_doc) assert html_biblio html_body = html_extract_body_teixml(resource.body) html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count')) html_biblio_dict = json.loads(html_biblio.json(exclude_none=True)) - if html_scope not in ('article-fulltext', 'unknown'): + if html_scope in ('blocked-captcha','blocked-cookie','blocked-forbidden'): + return dict( + status=html_scope, + html_biblio=html_biblio_dict, + html_scope=html_scope, + ) + elif html_scope == 'unknown': + html_body.pop("tei_xml", None) + return dict( + status="unknown-scope", + html_biblio=html_biblio_dict, + html_scope=html_scope, + html_body=html_body, + ) + elif html_scope not in ('article-fulltext',): html_body.pop("tei_xml", None) return dict( status="wrong-scope", html_biblio=html_biblio_dict, html_scope=html_scope, + html_body=html_body, ) raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules) @@ -365,17 +386,23 @@ class IngestFileWorker(SandcrawlerWorker): status="too-many-resources", html_biblio=html_biblio_dict, html_scope=html_scope, + html_body=html_body, ) - when = parse_cdx_datetime(resource.cdx.datetime) + if self.htmlteixml_sink and html_body['status'] == "success": + self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex']) - full_resources: List[WebResource] = [] + html_body.pop("tei_xml", None) partial_result = dict( html_biblio=html_biblio_dict, html_scope=html_scope, + html_body=html_body, ) + when = parse_cdx_datetime(resource.cdx.datetime) + full_resources: List[WebResource] = [] + try: if self.html_quick_mode: print(" WARN: running quick CDX-only fetches", file=sys.stderr) @@ -403,11 +430,6 @@ class IngestFileWorker(SandcrawlerWorker): partial_result['error_message'] = str(e)[:1600] return partial_result - if self.htmlteixml_sink and html_body['status'] == "success": - self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex']) - - html_body.pop("tei_xml", None) - return dict( html_body=html_body, html_biblio=html_biblio_dict, -- cgit v1.2.3