From de71aa92d4c7c9d14dfccc0188032d4e7b10090f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 4 Nov 2020 18:10:00 -0800 Subject: html: actually publish HTML TEI-XML to body; fix dataflow though ingest a bit --- python/sandcrawler/ingest.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index cc64fa5..e0778d2 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -73,6 +73,7 @@ class IngestFileWorker(SandcrawlerWorker): self.thumbnail_sink = kwargs.get('thumbnail_sink') self.pdftext_sink = kwargs.get('pdftext_sink') self.xmldoc_sink = kwargs.get('xmldoc_sink') + self.htmlteixml_sink = kwargs.get('htmlteixml_sink') self.max_hops = 6 self.try_existing_ingest = kwargs.get('try_existing_ingest', False) @@ -82,6 +83,7 @@ class IngestFileWorker(SandcrawlerWorker): self.try_spn2 = kwargs.get('try_spn2', True) self.html_quick_mode = False self.adblock_rules = load_adblock_rules() + self.max_html_resources = 200 self.base_url_blocklist = [ # robot blocking @@ -339,13 +341,26 @@ class IngestFileWorker(SandcrawlerWorker): html_doc = HTMLParser(resource.body) html_biblio = html_extract_biblio(resource.terminal_url, html_doc) + assert html_biblio html_body = html_extract_body_teixml(resource.body) html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('tei_xml')) - assert html_biblio + if html_scope not in ('article-fulltext', 'unknown'): + html_body.pop("tei_xml", None) + return dict( + status="html-body-wrong-scope", + html_biblio=html_biblio, + html_scope=html_scope, + ) raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules) - assert len(raw_resources) <= 200 + if len(raw_resources) > self.max_html_resources: + html_body.pop("tei_xml", None) + return dict( + status="too-many-resources", + html_biblio=html_biblio, + html_scope=html_scope, + ) when = parse_cdx_datetime(resource.cdx.datetime) @@ -355,6 +370,11 @@ class IngestFileWorker(SandcrawlerWorker): else: full_resources = fetch_html_resources(raw_resources, self.wayback_client, when) + if self.htmlteixml_sink and html_body['status'] == "success": + self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex']) + + html_body.pop("tei_xml", None) + return dict( html_body=html_body, html_biblio=json.loads(html_biblio.json(exclude_none=True)), @@ -587,9 +607,9 @@ class IngestFileWorker(SandcrawlerWorker): info = self.process_hit(ingest_type, resource, file_meta) result.update(info) - # scope is getting calculated in process_hit() - if result.get('scope') and result['scope'] not in ('article-fulltext', 'unknown'): - result['status'] = "wrong-scope" + # check if processing turned up an error + if info.get('status') not in ('success', None): + result['status'] = info['status'] return result result['status'] = "success" -- cgit v1.2.3