about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/misc.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-06 15:13:03 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:29 -0700
commit: 206969ccebb5007b6c687edd6e09b5c4910e0152 (patch)
tree: ec050216c43e69e972d48a702f933546c90e1459 /python/sandcrawler/misc.py
parent: b3447503c0aa2e326ce1e46c993be28f907ec23b (diff)
download: sandcrawler-206969ccebb5007b6c687edd6e09b5c4910e0152.tar.gz
download: sandcrawler-206969ccebb5007b6c687edd6e09b5c4910e0152.zip
local-file version of gen_file_metadata
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r-- python/sandcrawler/misc.py 43
1 files changed, 42 insertions, 1 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index a3e2960..dc46e9a 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -44,7 +44,10 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
assert blob is not None
if not allow_empty:
assert blob
- mimetype = magic.Magic(mime=True).from_buffer(blob)
+ if len(blob) < 1024*1024:
+ mimetype = magic.Magic(mime=True).from_buffer(blob)
+ else:
+ mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024*1024)])
if mimetype in ("application/xml", "text/xml"):
# crude checks for XHTML or JATS XML, using only first 1 kB of file
if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
@@ -66,6 +69,44 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
mimetype=mimetype,
)
def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
    """
    Compute size, checksums, and mimetype for a file on local disk.

    This is the path-based counterpart of gen_file_metadata(): instead of
    taking the file contents as bytes, it reads the file at `path`.

    The mimetype comes from libmagic's file-based detection. When that
    reports generic XML, the first 1 kB of the file is inspected to refine
    the result to XHTML or JATS XML. SHA-1, SHA-256 and MD5 digests are
    computed by streaming the file in 1 MiB chunks.

    Unless `allow_empty` is true, an assertion fails for a zero-byte file.

    Returns a dict with keys: size_bytes, sha1hex, sha256hex, md5hex,
    mimetype.
    """
    assert path is not None

    mimetype = magic.Magic(mime=True).from_file(path)
    if mimetype in ("application/xml", "text/xml"):
        # crude checks for XHTML or JATS XML, using only first 1 kB of file
        with open(path, 'rb') as sniff_file:
            head = sniff_file.read(1024)
        looks_xhtml = b"<htm" in head and b'xmlns="http://www.w3.org/1999/xhtml"' in head
        looks_jats = b"<article " in head and b"<html" not in head
        if looks_xhtml:
            mimetype = "application/xhtml+xml"
        elif looks_jats:
            mimetype = "application/jats+xml"

    sha1 = hashlib.sha1()
    sha256 = hashlib.sha256()
    md5 = hashlib.md5()
    total_bytes = 0
    with open(path, 'rb') as data_file:
        # read() returns b"" at end of file, which ends the iteration
        for chunk in iter(lambda: data_file.read(1024 * 1024), b""):
            total_bytes += len(chunk)
            sha1.update(chunk)
            sha256.update(chunk)
            md5.update(chunk)

    if not allow_empty:
        assert total_bytes > 0

    return {
        "size_bytes": total_bytes,
        "sha1hex": sha1.hexdigest(),
        "sha256hex": sha256.hexdigest(),
        "md5hex": md5.hexdigest(),
        "mimetype": mimetype,
    }
+
def b32_hex(s: str) -> str:
"""
Converts a base32-encoded SHA-1 checksum into hex-encoded