aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r--python/sandcrawler/ingest.py4
1 files changed, 2 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index aedf2ff..c9b3d2f 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -554,7 +554,7 @@ class IngestFileWorker(SandcrawlerWorker):
# here we split based on ingest type to try and extract a next hop
html_ish_resource = bool(
"html" in file_meta['mimetype']
- or "xhtml" in file_meta['mimetype']
+ or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml"
or "application/xml" in file_meta['mimetype']
or "text/xml" in file_meta['mimetype']
)
@@ -653,7 +653,7 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = "wrong-mimetype"
return result
elif ingest_type == "html":
- if file_meta['mimetype'] not in ("text/html",):
+ if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
result['status'] = "wrong-mimetype"
return result
else: