aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-09-14 14:13:34 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-09-14 14:13:34 -0700
commit ee6129ea884036b666de7cff4ad7891675a52b3c (patch)
tree f3f2d4970f2622b16425eab7ae0de2eacac30ef5 /python
parent 62252a6179953ccc79a6cb60c40a756fa0a034e1 (diff)
download sandcrawler-ee6129ea884036b666de7cff4ad7891675a52b3c.tar.gz
sandcrawler-ee6129ea884036b666de7cff4ad7891675a52b3c.zip
ingest: treat text/xml as XHTML in pdf ingest
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 2
1 files changed, 1 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index c45437d..e8b3551 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -429,7 +429,7 @@ class IngestFileWorker(SandcrawlerWorker):
)
file_meta = gen_file_metadata(resource.body)
- if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype']:
+ if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype'] or "text/xml" in file_meta['mimetype']:
# Got landing page or similar. Some XHTML detected as "application/xml"
if resource.terminal_dt:
result['terminal'] = {