diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-02-25 12:53:41 -0800 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-25 12:53:41 -0800 |
| commit | 2069332aa58bd7d5804639a1adec3cde0118a5b0 (patch) | |
| tree | 86746e24d1b5e435d1cfc81d9ecfcdde6c7e43ae | |
| parent | ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507 (diff) | |
| download | sandcrawler-2069332aa58bd7d5804639a1adec3cde0118a5b0.tar.gz sandcrawler-2069332aa58bd7d5804639a1adec3cde0118a5b0.zip | |
ingest: narrow xhtml filter
| -rw-r--r-- | python/sandcrawler/ingest.py | 2 |
1 file changed, 1 insertion, 1 deletion
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 6ec54f6..0d4e7c6 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -290,7 +290,7 @@ class IngestFileWorker(SandcrawlerWorker):
             return result
 
         file_meta = gen_file_metadata(resource.body)
-        if "html" in file_meta['mimetype'] or "xml" in file_meta['mimetype']:
+        if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype']:
             # Got landing page or similar. Some XHTML detected as "application/xml"
             if resource.terminal_dt:
                 result['terminal'] = {
```
