aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/misc.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r--python/sandcrawler/misc.py43
1 files changed, 43 insertions, 0 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
new file mode 100644
index 0000000..e13b5e7
--- /dev/null
+++ b/python/sandcrawler/misc.py
@@ -0,0 +1,43 @@
+
+import base64
+import magic
+import hashlib
+
def gen_file_metadata(blob: bytes) -> dict:
    """
    Takes a file blob (bytestream) and returns hashes and other metadata.

    Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype

    The hex digests are computed over the entire blob; mimetype is whatever
    `magic.Magic(mime=True).from_buffer()` reports for it.

    Raises AssertionError if blob is empty or None.
    """
    # Raise explicitly instead of using a bare `assert` statement: asserts are
    # stripped when Python runs with -O, which would let empty/None blobs
    # through. AssertionError is kept so existing callers see the same
    # exception type as before.
    if not blob:
        raise AssertionError("blob must be a non-empty bytestream")
    mimetype = magic.Magic(mime=True).from_buffer(blob)
    # Each digest is computed right next to its key, so a digest can never be
    # mislabeled by getting a positional index out of sync.
    return dict(
        size_bytes=len(blob),
        sha1hex=hashlib.sha1(blob).hexdigest(),
        sha256hex=hashlib.sha256(blob).hexdigest(),
        md5hex=hashlib.md5(blob).hexdigest(),
        mimetype=mimetype,
    )
+
def b32_hex(s: str) -> str:
    """
    Converts a base32-encoded SHA-1 checksum into hex-encoded

    base32 checksums are used by, eg, heritrix and in wayback CDX files

    Only the first whitespace-separated token of the input is considered, and
    an optional "sha1:" prefix (any case) is dropped. A 40-character token is
    assumed to already be hex-encoded and is returned lower-cased as-is (it
    is not checked for valid hex digits).

    Raises ValueError if the input is empty/whitespace-only, if the token is
    neither 32 nor 40 characters long, or if it is not valid base32.
    """
    tokens = s.split()
    if not tokens:
        # Previously this case surfaced as a bare IndexError from `[0]`;
        # report it the same way as any other malformed checksum.
        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
    token = tokens[0].lower()
    if token.startswith("sha1:"):
        token = token[5:]
    if len(token) == 40:
        # Already the length of a hex-encoded SHA-1; pass through.
        return token
    if len(token) != 32:
        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(token))
    # b32decode only accepts the upper-case alphabet by default; it raises
    # binascii.Error (a ValueError subclass) on invalid characters.
    raw = base64.b32decode(token.upper())
    return base64.b16encode(raw).lower().decode('utf-8')
+