-rw-r--r-- | python/sandcrawler/pdf.py  | 15
-rw-r--r-- | python/tests/test_pdf.py   | 61
2 files changed, 70 insertions, 6 deletions
diff --git a/python/sandcrawler/pdf.py b/python/sandcrawler/pdf.py
index b9baed7..cfba679 100644
--- a/python/sandcrawler/pdf.py
+++ b/python/sandcrawler/pdf.py
@@ -1,5 +1,6 @@
 import sys
+import datetime
 from io import BytesIO
 from dataclasses import dataclass
 from typing import Optional, Dict, Any
@@ -87,23 +88,24 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
         pageN = pdf.create_page(n)
         full_text += pageN.text()
     pdf_info = pdf.infos()
-    # Is this actually needed? or does json marshalling work automatically?
-    #for k in pdf_info.keys():
-    #    if isinstance(pdf_info[k], datetime.datetime):
-    #        pdf_info[k] = datetime.datetime.isoformat(pdf_info[k])
+    # TODO: is this actually needed? or does json marshalling work automatically?
+    for k in pdf_info.keys():
+        if isinstance(pdf_info[k], datetime.datetime):
+            pdf_info[k] = datetime.datetime.isoformat(pdf_info[k])
     return PdfExtractResult(
         sha1hex=sha1hex,
         file_meta=file_meta,
         status='success',
         error_msg=None,
-        text=full_text,
+        text=full_text or None,
         page0_thumbnail=page0_thumbnail,
-        meta_xml=pdf.metadata,
+        meta_xml=pdf.metadata or None,
         pdf_info=pdf.infos(),
         pdf_extra=dict(
             height=page0rect.height,
             width=page0rect.width,
+            page_count=pdf.pages,
             permanent_id=pdf.pdf_id.permanent_id,
             update_id=pdf.pdf_id.update_id,
             pdf_version=f"{pdf.pdf_version[0]}.{pdf.pdf_version[1]}",
@@ -155,6 +157,7 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
     def process(self, blob, key: Optional[str] = None):
         if not blob:
             return None
+        assert isinstance(blob, bytes)
         result = process_pdf(blob)
 
         if self.thumbnail_sink and result.page0_thumbnail is not None:
diff --git a/python/tests/test_pdf.py b/python/tests/test_pdf.py
new file mode 100644
index 0000000..1ccf85c
--- /dev/null
+++ b/python/tests/test_pdf.py
@@ -0,0 +1,61 @@
+
+import pytest
+import struct
+import responses
+
+from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from sandcrawler.pdf import process_pdf
+from test_wayback import wayback_client, cdx_client
+
+
+FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+
+def test_process_fake_pdf():
+    resp = process_pdf(FAKE_PDF_BYTES)
+    print(resp)
+    assert resp.status == "not-pdf"
+
+def test_process_dummy_pdf():
+    with open('tests/files/dummy.pdf', 'rb') as f:
+        pdf_bytes = f.read()
+    resp = process_pdf(pdf_bytes)
+    assert resp.status == 'success'
+    assert resp.page0_thumbnail is not None
+    assert len(resp.text) > 10
+    assert resp.meta_xml is None
+    assert resp.file_meta['mimetype'] == 'application/pdf'
+    print(resp.pdf_info)
+    print(resp.pdf_extra)
+    assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis"
+    # 595 x 842
+    assert resp.pdf_extra['height'] == 842
+    assert resp.pdf_extra['width'] == 595
+    assert resp.pdf_extra['page_count'] == 1
+
+def test_pdfextract_worker_cdx(wayback_client):
+
+    sink = BlackholeSink()
+    worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
+
+    with open('tests/files/example.cdx', 'r') as cdx_file:
+        pusher = CdxLinePusher(
+            worker,
+            cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+        )
+        pusher_counts = pusher.run()
+        assert pusher_counts['total']
+        assert pusher_counts['pushed'] == 7
+        assert pusher_counts['pushed'] == worker.counts['total']
+
+def test_pdfextract_blob_worker():
+
+    sink = BlackholeSink()
+    worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
+
+    with open('tests/files/dummy.pdf', 'rb') as f:
+        pdf_bytes = f.read()
+
+    worker.process(pdf_bytes)
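
Note: the datetime conversion loop enabled in this commit exists because Python's json module cannot serialize datetime.datetime values, and the isinstance check in the diff suggests the dict returned by pdf.infos() can contain such values. A minimal standalone sketch of the idea follows; the pdf_info dict and its 'CreationDate' entry are hypothetical stand-ins for real poppler output, not part of this commit.

import json
import datetime

# Hypothetical stand-in for the dict returned by pdf.infos().
pdf_info = {
    "Author": "Example Author",
    "CreationDate": datetime.datetime(2020, 6, 25, 12, 0, 0),
}

# json.dumps() raises TypeError on datetime objects, so convert them
# to ISO 8601 strings first (same pattern as the loop in the diff).
for k in pdf_info.keys():
    if isinstance(pdf_info[k], datetime.datetime):
        pdf_info[k] = datetime.datetime.isoformat(pdf_info[k])

print(json.dumps(pdf_info))
# {"Author": "Example Author", "CreationDate": "2020-06-25T12:00:00"}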