about summary refs log tree commit diff stats
path: root/python/tests/test_pdfextract.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests/test_pdfextract.py')
-rw-r--r-- python/tests/test_pdfextract.py 71
1 files changed, 71 insertions, 0 deletions
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
new file mode 100644
index 0000000..9d75655
--- /dev/null
+++ b/python/tests/test_pdfextract.py
@@ -0,0 +1,71 @@
+import struct
+
+import poppler
+import pytest
+from test_wayback import cdx_client, wayback_client # noqa:F401
+
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker
+from sandcrawler.pdfextract import process_pdf
+
+FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)  # PDF-like header, junk body
+
+
+def test_process_fake_pdf():
+    """Inputs that are not real PDFs must come back with status "not-pdf"."""
+    resp = process_pdf(FAKE_PDF_BYTES)
+    print(resp)
+    assert resp.status == "not-pdf"
+    with open("tests/files/dummy_zip.zip", "rb") as f:
+        pdf_bytes = f.read()
+    resp = process_pdf(pdf_bytes)
+    assert resp.status == "not-pdf"
+
+
+@pytest.mark.skipif(
+    poppler.version_string() == "0.71.0", reason="unsupported version of poppler"
+)
+def test_process_dummy_pdf():
+    """Extract the dummy PDF and check status, text, metadata and page size."""
+    with open("tests/files/dummy.pdf", "rb") as f:
+        pdf_bytes = f.read()
+    resp = process_pdf(pdf_bytes)
+    assert resp.status == "success"
+    assert resp.page0_thumbnail is not None
+    assert len(resp.text) > 10
+    assert resp.meta_xml is None
+    assert resp.file_meta["mimetype"] == "application/pdf"
+    print(resp.pdf_info)  # dump the extracted metadata for inspection
+    print(resp.pdf_extra)
+    assert resp.pdf_info["Author"] == "Evangelos Vlachogiannis"
+    assert resp.pdf_extra["page0_height"] == 842  # page is 595 x 842 (width x height)
+    assert resp.pdf_extra["page0_width"] == 595
+    assert resp.pdf_extra["page_count"] == 1
+
+
+def test_pdfextract_worker_cdx(wayback_client):  # noqa: F811
+    """Push the example CDX file through a PdfExtractWorker and check the counters."""
+    sink = BlackholeSink()
+    worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
+
+    with open("tests/files/example.cdx", "r") as cdx_file:
+        pusher = CdxLinePusher(
+            worker,
+            cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=["application/pdf"],
+        )
+        pusher_counts = pusher.run()
+        assert pusher_counts["total"]
+        assert pusher_counts["pushed"] == 7
+        assert pusher_counts["pushed"] == worker.counts["total"]
+
+
+def test_pdfextract_blob_worker():
+    """Smoke test: the blob worker processes raw PDF bytes without raising."""
+    sink = BlackholeSink()
+    worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
+
+    with open("tests/files/dummy.pdf", "rb") as f:
+        pdf_bytes = f.read()
+
+    worker.process(pdf_bytes)