aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-03 17:17:02 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-03 17:17:02 -0800
commit e99d9f2fddcb8b52ba52128b290ec5e0f367392f (patch)
tree 4005c92820650098a72435082c47fd652537a8b2
parent ab1c3cb70a9bc13ab2ff971a701aa9615c73d205 (diff)
download sandcrawler-e99d9f2fddcb8b52ba52128b290ec5e0f367392f.tar.gz
sandcrawler-e99d9f2fddcb8b52ba52128b290ec5e0f367392f.zip
gen_file_metadata: detect JATS XML and use application/jats+xml
-rw-r--r-- python/sandcrawler/misc.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 8c91246..67e5c0b 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -26,6 +26,10 @@ def gen_file_metadata(blob: bytes) -> dict:
"""
assert blob
mimetype = magic.Magic(mime=True).from_buffer(blob)
+ if mimetype in ("application/xml", "text/xml"):
+ # crude check for JATS XML, using only first 1 kB of file
+ if b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+ mimetype = "application/jats+xml"
hashes = [
hashlib.sha1(),
hashlib.sha256(),