about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2021-10-06 15:13:03 -0700
committer: Bryan Newbold <bnewbold@archive.org>, 2021-10-15 18:15:29 -0700
commit: 206969ccebb5007b6c687edd6e09b5c4910e0152 (patch)
tree: ec050216c43e69e972d48a702f933546c90e1459
parent: b3447503c0aa2e326ce1e46c993be28f907ec23b (diff)
download: sandcrawler-206969ccebb5007b6c687edd6e09b5c4910e0152.tar.gz
download: sandcrawler-206969ccebb5007b6c687edd6e09b5c4910e0152.zip
local-file version of gen_file_metadata
-rw-r--r-- python/sandcrawler/__init__.py | 2
-rw-r--r-- python/sandcrawler/misc.py | 43
-rw-r--r-- python/tests/test_misc.py | 14
3 files changed, 56 insertions, 3 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 724a39c..4e004be 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,7 +1,7 @@
from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
+from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
from .ingest_file import IngestFileWorker
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index a3e2960..dc46e9a 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -44,7 +44,10 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
assert blob is not None
if not allow_empty:
assert blob
- mimetype = magic.Magic(mime=True).from_buffer(blob)
+ if len(blob) < 1024*1024:
+ mimetype = magic.Magic(mime=True).from_buffer(blob)
+ else:
+ mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024*1024)])
if mimetype in ("application/xml", "text/xml"):
# crude checks for XHTML or JATS XML, using only first 1 kB of file
if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
@@ -66,6 +69,44 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
mimetype=mimetype,
)
+def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
+ """
+ Variant of gen_file_metadata() which works with files on local disk
+ """
+ assert path is not None
+ mimetype = magic.Magic(mime=True).from_file(path)
+ if mimetype in ("application/xml", "text/xml"):
+ with open(path, 'rb') as f:
+ blob = f.read(1024)
+ # crude checks for XHTML or JATS XML, using only first 1 kB of file
+ if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
+ mimetype = "application/xhtml+xml"
+ elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+ mimetype = "application/jats+xml"
+ hashes = [
+ hashlib.sha1(),
+ hashlib.sha256(),
+ hashlib.md5(),
+ ]
+ size_bytes = 0
+ with open(path, 'rb') as f:
+ while True:
+ chunk = f.read(1024*1024)
+ if not chunk:
+ break
+ size_bytes += len(chunk)
+ for h in hashes:
+ h.update(chunk)
+ if not allow_empty:
+ assert size_bytes > 0
+ return dict(
+ size_bytes=size_bytes,
+ sha1hex=hashes[0].hexdigest(),
+ sha256hex=hashes[1].hexdigest(),
+ md5hex=hashes[2].hexdigest(),
+ mimetype=mimetype,
+ )
+
def b32_hex(s: str) -> str:
"""
Converts a base32-encoded SHA-1 checksum into hex-encoded
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 29f9e9f..bd18e5c 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
import pytest
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
+from sandcrawler import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, clean_url
def test_gen_file_metadata():
@@ -26,6 +26,18 @@ def test_gen_file_metadata():
assert fm['mimetype'] == 'text/plain'
assert fm['size_bytes'] == 8
+def test_gen_file_metadata_path():
+
+ # valid (but very small) PDF file
+ file_meta = gen_file_metadata_path('tests/files/dummy.pdf')
+ assert file_meta == {
+ 'mimetype': 'application/pdf',
+ 'md5hex': '2942bfabb3d05332b66eb128e0842cff',
+ 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
+ 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
+ 'size_bytes': 13264,
+ }
+
def test_b32_hex():
# valid b32