about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:16:53 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:16:53 -0800
commit 5a9e8d9441662c508cf583114b9edc85cc608587 (patch)
tree 32c927e01a528d33f4fe6f290b355ccea224ba58 /python
parent eabef14c79fb36e6076c215887b69630c482a729 (diff)
download sandcrawler-5a9e8d9441662c508cf583114b9edc85cc608587.tar.gz
download sandcrawler-5a9e8d9441662c508cf583114b9edc85cc608587.zip
ingest: retain html_biblio through hops; all ingest types
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 14
1 file changed, 13 insertions, 1 deletion
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 028f2b2..5d31d62 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -257,7 +257,11 @@ class IngestFileWorker(SandcrawlerWorker):
'xml_meta': self.process_xml(resource, file_meta),
}
elif ingest_type == "html":
- return self.process_html(resource, file_meta)
+ html_info = self.process_html(resource, file_meta)
+ # if there is no html_biblio, don't clobber anything possibly extracted earlier
+ if 'html_biblio' in html_info and not html_info['html_biblio']:
+ html_info.pop('html_biblio')
+ return html_info
else:
raise NotImplementedError(f"process {ingest_type} hit")
@@ -559,6 +563,14 @@ class IngestFileWorker(SandcrawlerWorker):
or "application/xml" in file_meta['mimetype']
or "text/xml" in file_meta['mimetype']
)
+ html_biblio = None
+ if html_ish_resource and resource.body:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio and html_biblio.title:
+ result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
+ #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+
if ingest_type == "pdf" and html_ish_resource:
# Got landing page or similar. Some XHTML detected as "application/xml"
fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)