diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:25:31 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:25:31 -0700 |
commit | 070962c63547b0ae5373c57164524ff95053373d (patch) | |
tree | fc80203449efcd0e012dc56fb2eb68d5448f8c66 | |
parent | 3cdf4af9be4c762ff2ed79a57b5ad30637909f1e (diff) | |
download | sandcrawler-070962c63547b0ae5373c57164524ff95053373d.tar.gz sandcrawler-070962c63547b0ae5373c57164524ff95053373d.zip |
ingest_html: update trafilatura TEI-XML output kwarg
-rw-r--r-- | python/sandcrawler/ingest_html.py | 2 |
1 file changed, 1 insertion, 1 deletion
```diff
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 7e6e5e3..9c72dd5 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -21,7 +21,7 @@ TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
 def html_extract_body_teixml(doc: bytes) -> dict:
     try:
         tei_xml = trafilatura.extract(doc,
-            tei_output=True,
+            output_format='xmltei',
             include_comments=False,
             include_formatting=True,
         )
```