From 206969ccebb5007b6c687edd6e09b5c4910e0152 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 6 Oct 2021 15:13:03 -0700 Subject: local-file version of gen_file_metadata --- python/sandcrawler/__init__.py | 2 +- python/sandcrawler/misc.py | 43 +++++++++++++++++++++++++++++++++++++++++- python/tests/test_misc.py | 14 +++++++++++++- 3 files changed, 56 insertions(+), 3 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 724a39c..4e004be 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -1,7 +1,7 @@ from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker -from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url +from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow from .ingest_file import IngestFileWorker diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index a3e2960..dc46e9a 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -44,7 +44,10 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict: assert blob is not None if not allow_empty: assert blob - mimetype = magic.Magic(mime=True).from_buffer(blob) + if len(blob) < 1024*1024: + mimetype = magic.Magic(mime=True).from_buffer(blob) + else: + mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024*1024)]) if mimetype in ("application/xml", "text/xml"): # crude checks for XHTML or JATS XML, using only first 1 kB of file if b" dict: mimetype=mimetype, ) +def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict: + """ + Variant of gen_file_metadata() which works with files on local disk + """ + assert path is not None + mimetype = magic.Magic(mime=True).from_file(path) + if mimetype in ("application/xml", "text/xml"): + with open(path, 'rb') as f: + blob = f.read(1024) + # crude checks for XHTML or JATS XML, using only first 1 kB of file + if b" 0 + return dict( + size_bytes=size_bytes, + sha1hex=hashes[0].hexdigest(), + sha256hex=hashes[1].hexdigest(), + md5hex=hashes[2].hexdigest(), + mimetype=mimetype, + ) + def b32_hex(s: str) -> str: """ Converts a base32-encoded SHA-1 checksum into hex-encoded diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index 29f9e9f..bd18e5c 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -1,7 +1,7 @@ import pytest -from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url +from sandcrawler import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, clean_url def test_gen_file_metadata(): @@ -26,6 +26,18 @@ def test_gen_file_metadata(): assert fm['mimetype'] == 'text/plain' assert fm['size_bytes'] == 8 +def test_gen_file_metadata_path(): + + # valid (but very small) PDF file + file_meta = gen_file_metadata_path('tests/files/dummy.pdf') + assert file_meta == { + 'mimetype': 'application/pdf', + 'md5hex': '2942bfabb3d05332b66eb128e0842cff', + 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36', + 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4', + 'size_bytes': 13264, + } + def test_b32_hex(): # valid b32 -- cgit v1.2.3