diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:16:53 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:16:53 -0800 |
commit | 5a9e8d9441662c508cf583114b9edc85cc608587 (patch) | |
tree | 32c927e01a528d33f4fe6f290b355ccea224ba58 | |
parent | eabef14c79fb36e6076c215887b69630c482a729 (diff) | |
download | sandcrawler-5a9e8d9441662c508cf583114b9edc85cc608587.tar.gz sandcrawler-5a9e8d9441662c508cf583114b9edc85cc608587.zip |
ingest: retain html_biblio through hops; all ingest types
-rw-r--r-- | python/sandcrawler/ingest.py | 14 |
1 file changed, 13 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 028f2b2..5d31d62 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -257,7 +257,11 @@ class IngestFileWorker(SandcrawlerWorker):
                 'xml_meta': self.process_xml(resource, file_meta),
             }
         elif ingest_type == "html":
-            return self.process_html(resource, file_meta)
+            html_info = self.process_html(resource, file_meta)
+            # if there is no html_biblio, don't clobber anything possibly extracted earlier
+            if 'html_biblio' in html_info and not html_info['html_biblio']:
+                html_info.pop('html_biblio')
+            return html_info
         else:
             raise NotImplementedError(f"process {ingest_type} hit")
 
@@ -559,6 +563,14 @@ class IngestFileWorker(SandcrawlerWorker):
                 or "application/xml" in file_meta['mimetype']
                 or "text/xml" in file_meta['mimetype']
             )
+            html_biblio = None
+            if html_ish_resource and resource.body:
+                html_doc = HTMLParser(resource.body)
+                html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+                if html_biblio and html_biblio.title:
+                    result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
+                    #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+
             if ingest_type == "pdf" and html_ish_resource:
                 # Got landing page or similar. Some XHTML detected as "application/xml"
                 fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
```