aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/html_ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r--python/sandcrawler/html_ingest.py3
1 files changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index e86fa2b..acd336e 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -25,6 +25,9 @@ def html_extract_fulltext_teixml(doc: bytes) -> dict:
)
if tei_xml:
return dict(status="success", tei_xml=tei_xml)
+ elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'):
+ # hack for firstmonday.org
+ return html_extract_fulltext_teixml(doc[106:])
else:
return dict(status="empty-xml")