ingest: retain html_biblio through hops; all ingest types

author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:16:53 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:16:53 -0800
commit: 5a9e8d9441662c508cf583114b9edc85cc608587 (patch)
tree: 32c927e01a528d33f4fe6f290b355ccea224ba58
parent: eabef14c79fb36e6076c215887b69630c482a729 (diff)
download: sandcrawler-5a9e8d9441662c508cf583114b9edc85cc608587.tar.gz sandcrawler-5a9e8d9441662c508cf583114b9edc85cc608587.zip
1 file changed, 13 insertions, 1 deletion
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 028f2b2..5d31d62 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -257,7 +257,11 @@ class IngestFileWorker(SandcrawlerWorker):
                 'xml_meta': self.process_xml(resource, file_meta),
             }
         elif ingest_type == "html":
-            return self.process_html(resource, file_meta)
+            html_info = self.process_html(resource, file_meta)
+            # if there is no html_biblio, don't clobber anything possibly extracted earlier
+            if 'html_biblio' in html_info and not html_info['html_biblio']:
+                html_info.pop('html_biblio')
+            return html_info
         else:
             raise NotImplementedError(f"process {ingest_type} hit")
 
@@ -559,6 +563,14 @@ class IngestFileWorker(SandcrawlerWorker):
                 or "application/xml" in file_meta['mimetype']
                 or "text/xml" in file_meta['mimetype']
             )
+            html_biblio = None
+            if html_ish_resource and resource.body:
+                html_doc = HTMLParser(resource.body)
+                html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+                if html_biblio and html_biblio.title:
+                    result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
+                    #print(f"  setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+
             if ingest_type == "pdf" and html_ish_resource:
                 # Got landing page or similar. Some XHTML detected as "application/xml"
                 fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
author	Bryan Newbold <bnewbold@archive.org>	2020-11-08 14:16:53 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-11-08 14:16:53 -0800
commit	5a9e8d9441662c508cf583114b9edc85cc608587 (patch)
tree	32c927e01a528d33f4fe6f290b355ccea224ba58
parent	eabef14c79fb36e6076c215887b69630c482a729 (diff)
download	sandcrawler-5a9e8d9441662c508cf583114b9edc85cc608587.tar.gz sandcrawler-5a9e8d9441662c508cf583114b9edc85cc608587.zip