# python/sandcrawler/misc.py
#
# Added by commit b438f52dbb7578c9a5c2153bc4ba50e33fdae7c3
# (Bryan Newbold, Mon, 23 Sep 2019 22:58:55 -0700):
# "start refactoring sandcrawler python common code"

import base64
import hashlib


def gen_file_metadata(blob):
    """
    Takes a file blob (bytestream) and returns hashes and other metadata.

    Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype

    The mimetype is whatever `magic.Magic(mime=True).from_buffer()` reports
    for the blob; the three digests are lower-case hex strings.

    Raises AssertionError if blob is empty or None.
    """
    # Explicit raise (instead of a bare `assert`) so the check is not stripped
    # when running under `python -O`; the exception type is kept for callers.
    if not blob:
        raise AssertionError("blob must be a non-empty bytestream")

    # Imported here rather than at module level so that the pure-stdlib
    # helpers in this module remain importable where libmagic is missing.
    import magic

    mimetype = magic.Magic(mime=True).from_buffer(blob)
    return dict(
        size_bytes=len(blob),
        sha1hex=hashlib.sha1(blob).hexdigest(),
        sha256hex=hashlib.sha256(blob).hexdigest(),
        md5hex=hashlib.md5(blob).hexdigest(),
        mimetype=mimetype,
    )


def b32_hex(s):
    """
    Converts a base32-encoded SHA-1 checksum into hex-encoded

    base32 checksums are used by, eg, heritrix and in wayback CDX files

    Only the first whitespace-separated token of the input is considered, and
    an optional "sha1:" prefix (any case) is removed. A 40-character token is
    taken to be hex already and is returned lower-cased without further
    validation.

    Raises ValueError if the input is blank, if the token is neither 32 nor
    40 characters long, or if it is not valid base32.
    """
    tokens = s.split()
    if not tokens:
        # Blank input used to escape as an IndexError from `split()[0]`.
        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))

    digest = tokens[0].lower()
    if digest.startswith("sha1:"):
        digest = digest[5:]

    if len(digest) == 40:
        return digest
    if len(digest) != 32:
        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(digest))

    # casefold=True accepts the lower-cased alphabet; bytes.hex() is lower-case.
    return base64.b32decode(digest, casefold=True).hex()