3 files changed, 56 insertions, 3 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 724a39c..4e004be 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,7 +1,7 @@
 
 from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
 from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
+from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
 from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
 from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
 from .ingest_file import IngestFileWorker
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index a3e2960..dc46e9a 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -44,7 +44,10 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     assert blob is not None
     if not allow_empty:
         assert blob
-    mimetype = magic.Magic(mime=True).from_buffer(blob)
+    if len(blob) < 1024*1024:
+        mimetype = magic.Magic(mime=True).from_buffer(blob)
+    else:
+        mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024*1024)])
     if mimetype in ("application/xml", "text/xml"):
         # crude checks for XHTML or JATS XML, using only first 1 kB of file
         if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
@@ -66,6 +69,44 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
         mimetype=mimetype,
     )
 
+def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
+    """
+    Variant of gen_file_metadata() which works with files on local disk.
+
+    Hashes the file in a single pass of 1 MiB chunks; returns a dict with
+    size_bytes, sha1hex, sha256hex, md5hex, mimetype. Asserts non-empty unless allow_empty.
+    """
+    assert path is not None
+    mimetype = magic.Magic(mime=True).from_file(path)
+    hashes = [hashlib.sha1(), hashlib.sha256(), hashlib.md5()]
+    size_bytes = 0
+    head = b""
+    with open(path, 'rb') as f:
+        while True:
+            chunk = f.read(1024*1024)
+            if not chunk:
+                break
+            if not size_bytes:
+                head = chunk[:1024]  # first 1 kB of file, kept for the XML checks below
+            size_bytes += len(chunk)
+            for h in hashes:
+                h.update(chunk)
+    if mimetype in ("application/xml", "text/xml"):
+        # crude checks for XHTML or JATS XML, using only first 1 kB of file
+        if b"<htm" in head and b'xmlns="http://www.w3.org/1999/xhtml"' in head:
+            mimetype = "application/xhtml+xml"
+        elif b"<article " in head and b"<html" not in head:
+            mimetype = "application/jats+xml"
+    if not allow_empty:
+        assert size_bytes > 0
+    return dict(
+        size_bytes=size_bytes,
+        sha1hex=hashes[0].hexdigest(),
+        sha256hex=hashes[1].hexdigest(),
+        md5hex=hashes[2].hexdigest(),
+        mimetype=mimetype,
+    )
+
 def b32_hex(s: str) -> str:
     """
     Converts a base32-encoded SHA-1 checksum into hex-encoded
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 29f9e9f..bd18e5c 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
 
 import pytest
 
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
+from sandcrawler import gen_file_metadata, gen_file_metadata_path,  b32_hex, parse_cdx_line, clean_url
 
 def test_gen_file_metadata():
     
@@ -26,6 +26,18 @@ def test_gen_file_metadata():
     assert fm['mimetype'] == 'text/plain'
     assert fm['size_bytes'] == 8
 
+def test_gen_file_metadata_path():
+    """Check gen_file_metadata_path() output against known values for a fixture PDF."""
+    # valid (but very small) PDF file
+    expected = {
+        'size_bytes': 13264,
+        'mimetype': 'application/pdf',
+        'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
+        'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
+        'md5hex': '2942bfabb3d05332b66eb128e0842cff',
+    }
+    assert gen_file_metadata_path('tests/files/dummy.pdf') == expected
+
 def test_b32_hex():
 
     # valid b32