about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:25:31 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:25:31 -0700
commit: 070962c63547b0ae5373c57164524ff95053373d (patch)
tree: fc80203449efcd0e012dc56fb2eb68d5448f8c66
parent: 3cdf4af9be4c762ff2ed79a57b5ad30637909f1e (diff)
download: sandcrawler-070962c63547b0ae5373c57164524ff95053373d.tar.gz
sandcrawler-070962c63547b0ae5373c57164524ff95053373d.zip
ingest_html: update trafilatura TEI-XML output kwarg
-rw-r--r-- python/sandcrawler/ingest_html.py 2
1 files changed, 1 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 7e6e5e3..9c72dd5 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -21,7 +21,7 @@ TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
def html_extract_body_teixml(doc: bytes) -> dict:
try:
tei_xml = trafilatura.extract(doc,
- tei_output=True,
+ output_format='xmltei',
include_comments=False,
include_formatting=True,
)