From e03f4eda875f4a36a3d3e62eac467b3f2b1e0fbe Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 17 Jun 2020 18:03:01 -0700 Subject: rename pdf tools to pdfextract --- python/tests/test_pdf.py | 61 ----------------------------------------- python/tests/test_pdfextract.py | 61 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 61 deletions(-) delete mode 100644 python/tests/test_pdf.py create mode 100644 python/tests/test_pdfextract.py (limited to 'python/tests') diff --git a/python/tests/test_pdf.py b/python/tests/test_pdf.py deleted file mode 100644 index 1ccf85c..0000000 --- a/python/tests/test_pdf.py +++ /dev/null @@ -1,61 +0,0 @@ - -import pytest -import struct -import responses - -from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient -from sandcrawler.pdf import process_pdf -from test_wayback import wayback_client, cdx_client - - -FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) - -def test_process_fake_pdf(): - resp = process_pdf(FAKE_PDF_BYTES) - print(resp) - assert resp.status == "not-pdf" - -def test_process_dummy_pdf(): - with open('tests/files/dummy.pdf', 'rb') as f: - pdf_bytes = f.read() - resp = process_pdf(pdf_bytes) - assert resp.status == 'success' - assert resp.page0_thumbnail is not None - assert len(resp.text) > 10 - assert resp.meta_xml is None - assert resp.file_meta['mimetype'] == 'application/pdf' - print(resp.pdf_info) - print(resp.pdf_extra) - assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis" - # 595 x 842 - assert resp.pdf_extra['height'] == 842 - assert resp.pdf_extra['width'] == 595 - assert resp.pdf_extra['page_count'] == 1 - -def test_pdfextract_worker_cdx(wayback_client): - - sink = BlackholeSink() - worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink) - - with open('tests/files/example.cdx', 'r') as cdx_file: - pusher = CdxLinePusher( - worker, - cdx_file, - filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], - ) - pusher_counts = pusher.run() - assert pusher_counts['total'] - assert pusher_counts['pushed'] == 7 - assert pusher_counts['pushed'] == worker.counts['total'] - -def test_pdfextract_blob_worker(): - - sink = BlackholeSink() - worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink) - - with open('tests/files/dummy.pdf', 'rb') as f: - pdf_bytes = f.read() - - worker.process(pdf_bytes) - diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py new file mode 100644 index 0000000..1ccf85c --- /dev/null +++ b/python/tests/test_pdfextract.py @@ -0,0 +1,61 @@ + +import pytest +import struct +import responses + +from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient +from sandcrawler.pdf import process_pdf +from test_wayback import wayback_client, cdx_client + + +FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) + +def test_process_fake_pdf(): + resp = process_pdf(FAKE_PDF_BYTES) + print(resp) + assert resp.status == "not-pdf" + +def test_process_dummy_pdf(): + with open('tests/files/dummy.pdf', 'rb') as f: + pdf_bytes = f.read() + resp = process_pdf(pdf_bytes) + assert resp.status == 'success' + assert resp.page0_thumbnail is not None + assert len(resp.text) > 10 + assert resp.meta_xml is None + assert resp.file_meta['mimetype'] == 'application/pdf' + print(resp.pdf_info) + print(resp.pdf_extra) + assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis" + # 595 x 842 + assert resp.pdf_extra['height'] == 842 + assert resp.pdf_extra['width'] == 595 + assert resp.pdf_extra['page_count'] == 1 + +def test_pdfextract_worker_cdx(wayback_client): + + sink = BlackholeSink() + worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink) + + with open('tests/files/example.cdx', 'r') as cdx_file: + pusher = CdxLinePusher( + worker, + cdx_file, + filter_http_statuses=[200, 226], + filter_mimetypes=['application/pdf'], + ) + pusher_counts = pusher.run() + assert pusher_counts['total'] + assert pusher_counts['pushed'] == 7 + assert pusher_counts['pushed'] == worker.counts['total'] + +def test_pdfextract_blob_worker(): + + sink = BlackholeSink() + worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink) + + with open('tests/files/dummy.pdf', 'rb') as f: + pdf_bytes = f.read() + + worker.process(pdf_bytes) + -- cgit v1.2.3