aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-25 12:53:41 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-25 12:53:41 -0800
commit 2069332aa58bd7d5804639a1adec3cde0118a5b0 (patch)
tree 86746e24d1b5e435d1cfc81d9ecfcdde6c7e43ae
parent ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507 (diff)
download sandcrawler-2069332aa58bd7d5804639a1adec3cde0118a5b0.tar.gz
sandcrawler-2069332aa58bd7d5804639a1adec3cde0118a5b0.zip
ingest: narrow xhtml filter
-rw-r--r-- python/sandcrawler/ingest.py 2
1 files changed, 1 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 6ec54f6..0d4e7c6 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -290,7 +290,7 @@ class IngestFileWorker(SandcrawlerWorker):
return result
file_meta = gen_file_metadata(resource.body)
- if "html" in file_meta['mimetype'] or "xml" in file_meta['mimetype']:
+ if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype']:
# Got landing page or similar. Some XHTML detected as "application/xml"
if resource.terminal_dt:
result['terminal'] = {