From 5a9e8d9441662c508cf583114b9edc85cc608587 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 8 Nov 2020 14:16:53 -0800
Subject: ingest: retain html_biblio through hops; all ingest types

---
 python/sandcrawler/ingest.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 028f2b2..5d31d62 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -257,7 +257,11 @@ class IngestFileWorker(SandcrawlerWorker):
                 'xml_meta': self.process_xml(resource, file_meta),
             }
         elif ingest_type == "html":
-            return self.process_html(resource, file_meta)
+            html_info = self.process_html(resource, file_meta)
+            # if there is no html_biblio, don't clobber anything possibly extracted earlier
+            if 'html_biblio' in html_info and not html_info['html_biblio']:
+                html_info.pop('html_biblio')
+            return html_info
         else:
             raise NotImplementedError(f"process {ingest_type} hit")
 
@@ -559,6 +563,14 @@ class IngestFileWorker(SandcrawlerWorker):
                 or "application/xml" in file_meta['mimetype']
                 or "text/xml" in file_meta['mimetype']
             )
+            html_biblio = None
+            if html_ish_resource and resource.body:
+                html_doc = HTMLParser(resource.body)
+                html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+                if html_biblio and html_biblio.title:
+                    result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
+                    #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+
             if ingest_type == "pdf" and html_ish_resource:
                 # Got landing page or similar. Some XHTML detected as "application/xml"
                 fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
--
cgit v1.2.3