diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 17:20:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 17:20:22 -0700 |
commit | e61d6e8cc3b6824816a83dff56ffbdbbb6329e57 (patch) | |
tree | 1a59b220e4f984b2a805b23961e4bd8a150c0f47 /python/sandcrawler | |
parent | a8387ac21bf6f9693cef24f9ef39482b9337f3af (diff) | |
download | sandcrawler-e61d6e8cc3b6824816a83dff56ffbdbbb6329e57.tar.gz sandcrawler-e61d6e8cc3b6824816a83dff56ffbdbbb6329e57.zip |
html: work around firstmonday DOCTYPE issue
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- | python/sandcrawler/html_ingest.py | 3 |
1 file changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py index e86fa2b..acd336e 100644 --- a/python/sandcrawler/html_ingest.py +++ b/python/sandcrawler/html_ingest.py @@ -25,6 +25,9 @@ def html_extract_fulltext_teixml(doc: bytes) -> dict: ) if tei_xml: return dict(status="success", tei_xml=tei_xml) + elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'): + # hack for firstmonday.org + return html_extract_fulltext_teixml(doc[106:]) else: return dict(status="empty-xml") |