From e99d9f2fddcb8b52ba52128b290ec5e0f367392f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Nov 2020 17:17:02 -0800 Subject: gen_file_metadata: detect JATS XML and use application/jats+xml --- python/sandcrawler/misc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index 8c91246..67e5c0b 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -26,6 +26,10 @@ def gen_file_metadata(blob: bytes) -> dict: """ assert blob mimetype = magic.Magic(mime=True).from_buffer(blob) + if mimetype in ("application/xml", "text/xml"): + # crude check for JATS XML, using only first 1 kB of file + if b"