diff options
| -rw-r--r-- | python/sandcrawler/__init__.py | 2 | ||||
| -rw-r--r-- | python/sandcrawler/misc.py | 43 | ||||
| -rw-r--r-- | python/tests/test_misc.py | 14 | 
3 files changed, 56 insertions, 3 deletions
NOTE(review): whitespace-only lines were dropped when this page was flattened.
The blank context lines below were re-inserted so that every hunk matches the
line counts in its @@ header; their exact positions in python/tests/test_misc.py
are a best guess and should be confirmed against the repository.

diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 724a39c..4e004be 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,7 +1,7 @@
 
 from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
 from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
+from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
 from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
 from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
 from .ingest_file import IngestFileWorker
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index a3e2960..dc46e9a 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -44,7 +44,10 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     assert blob is not None
     if not allow_empty:
         assert blob
-    mimetype = magic.Magic(mime=True).from_buffer(blob)
+    if len(blob) < 1024*1024:
+        mimetype = magic.Magic(mime=True).from_buffer(blob)
+    else:
+        mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024*1024)])
     if mimetype in ("application/xml", "text/xml"):
         # crude checks for XHTML or JATS XML, using only first 1 kB of file
         if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
@@ -66,6 +69,44 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
         mimetype=mimetype,
     )
 
+def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
+    """
+    Variant of gen_file_metadata() which works with files on local disk
+    """
+    assert path is not None
+    mimetype = magic.Magic(mime=True).from_file(path)
+    if mimetype in ("application/xml", "text/xml"):
+        with open(path, 'rb') as f:
+            blob = f.read(1024)
+            # crude checks for XHTML or JATS XML, using only first 1 kB of file
+            if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
+                mimetype = "application/xhtml+xml"
+            elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+                mimetype = "application/jats+xml"
+    hashes = [
+        hashlib.sha1(),
+        hashlib.sha256(),
+        hashlib.md5(),
+    ]
+    size_bytes = 0
+    with open(path, 'rb') as f:
+        while True:
+            chunk = f.read(1024*1024)
+            if not chunk:
+                break
+            size_bytes += len(chunk)
+            for h in hashes:
+                h.update(chunk)
+    if not allow_empty:
+        assert size_bytes > 0
+    return dict(
+        size_bytes=size_bytes,
+        sha1hex=hashes[0].hexdigest(),
+        sha256hex=hashes[1].hexdigest(),
+        md5hex=hashes[2].hexdigest(),
+        mimetype=mimetype,
+    )
+
 def b32_hex(s: str) -> str:
     """
     Converts a base32-encoded SHA-1 checksum into hex-encoded
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 29f9e9f..bd18e5c 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
 
 import pytest
 
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
+from sandcrawler import gen_file_metadata, gen_file_metadata_path,  b32_hex, parse_cdx_line, clean_url
 
 def test_gen_file_metadata():
     
@@ -26,6 +26,18 @@ def test_gen_file_metadata():
     assert fm['mimetype'] == 'text/plain'
     assert fm['size_bytes'] == 8
 
+def test_gen_file_metadata_path():
+    
+    # valid (but very small) PDF file
+    file_meta = gen_file_metadata_path('tests/files/dummy.pdf')
+    assert file_meta == {
+        'mimetype': 'application/pdf',
+        'md5hex': '2942bfabb3d05332b66eb128e0842cff',
+        'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
+        'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
+        'size_bytes': 13264,
+    }
+
 def test_b32_hex():
     
     # valid b32
