From 070962c63547b0ae5373c57164524ff95053373d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 Oct 2021 12:25:31 -0700 Subject: ingest_html: update trafilatura TEI-XML output kwarg --- python/sandcrawler/ingest_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py index 7e6e5e3..9c72dd5 100644 --- a/python/sandcrawler/ingest_html.py +++ b/python/sandcrawler/ingest_html.py @@ -21,7 +21,7 @@ TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}" def html_extract_body_teixml(doc: bytes) -> dict: try: tei_xml = trafilatura.extract(doc, - tei_output=True, + output_format='xmltei', include_comments=False, include_formatting=True, ) -- cgit v1.2.3