From ee6129ea884036b666de7cff4ad7891675a52b3c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Sep 2020 14:13:34 -0700 Subject: ingest: treat text/xml as XHTML in pdf ingest --- python/sandcrawler/ingest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index c45437d..e8b3551 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -429,7 +429,7 @@ class IngestFileWorker(SandcrawlerWorker): ) file_meta = gen_file_metadata(resource.body) - if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype']: + if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype'] or "text/xml" in file_meta['mimetype']: # Got landing page or similar. Some XHTML detected as "application/xml" if resource.terminal_dt: result['terminal'] = { -- cgit v1.2.3