about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/html_ingest.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-30 17:20:22 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-30 17:20:22 -0700
commit e61d6e8cc3b6824816a83dff56ffbdbbb6329e57 (patch)
tree 1a59b220e4f984b2a805b23961e4bd8a150c0f47 /python/sandcrawler/html_ingest.py
parent a8387ac21bf6f9693cef24f9ef39482b9337f3af (diff)
download sandcrawler-e61d6e8cc3b6824816a83dff56ffbdbbb6329e57.tar.gz
sandcrawler-e61d6e8cc3b6824816a83dff56ffbdbbb6329e57.zip
html: work around firstmonday DOCTYPE issue
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r-- python/sandcrawler/html_ingest.py 3
1 files changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index e86fa2b..acd336e 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -25,6 +25,9 @@ def html_extract_fulltext_teixml(doc: bytes) -> dict:
)
if tei_xml:
return dict(status="success", tei_xml=tei_xml)
+ elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'):
+ # hack for firstmonday.org
+ return html_extract_fulltext_teixml(doc[106:])
else:
return dict(status="empty-xml")