diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 17:17:02 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 17:17:02 -0800 |
commit | e99d9f2fddcb8b52ba52128b290ec5e0f367392f (patch) | |
tree | 4005c92820650098a72435082c47fd652537a8b2 /python | |
parent | ab1c3cb70a9bc13ab2ff971a701aa9615c73d205 (diff) | |
download | sandcrawler-e99d9f2fddcb8b52ba52128b290ec5e0f367392f.tar.gz sandcrawler-e99d9f2fddcb8b52ba52128b290ec5e0f367392f.zip |
gen_file_metadata: detect JATS XML and use application/jats+xml
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/misc.py | 4 |
1 file changed, 4 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 8c91246..67e5c0b 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -26,6 +26,10 @@ def gen_file_metadata(blob: bytes) -> dict:
     """
     assert blob
     mimetype = magic.Magic(mime=True).from_buffer(blob)
+    if mimetype in ("application/xml", "text/xml"):
+        # crude check for JATS XML, using only first 1 kB of file
+        if b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+            mimetype = "application/jats+xml"
     hashes = [
         hashlib.sha1(),
         hashlib.sha256(),
```