1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
import struct
import poppler
import pytest
from test_wayback import cdx_client, wayback_client # noqa:F401
from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker
from sandcrawler.pdfextract import process_pdf
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
def test_process_fake_pdf():
resp = process_pdf(FAKE_PDF_BYTES)
print(resp)
assert resp.status == "not-pdf"
with open("tests/files/dummy_zip.zip", "rb") as f:
pdf_bytes = f.read()
resp = process_pdf(pdf_bytes)
assert resp.status == "not-pdf"
@pytest.mark.skipif(
poppler.version_string() == "0.71.0", reason="unsupported version of poppler"
)
def test_process_dummy_pdf():
with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
resp = process_pdf(pdf_bytes)
assert resp.status == "success"
assert resp.page0_thumbnail is not None
assert len(resp.text) > 10
assert resp.meta_xml is None
assert resp.file_meta["mimetype"] == "application/pdf"
print(resp.pdf_info)
print(resp.pdf_extra)
assert resp.pdf_info["Author"] == "Evangelos Vlachogiannis"
# 595 x 842
assert resp.pdf_extra["page0_height"] == 842
assert resp.pdf_extra["page0_width"] == 595
assert resp.pdf_extra["page_count"] == 1
def test_pdfextract_worker_cdx(wayback_client): # noqa: F811
sink = BlackholeSink()
worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(
worker,
cdx_file,
filter_http_statuses=[200, 226],
filter_mimetypes=["application/pdf"],
)
pusher_counts = pusher.run()
assert pusher_counts["total"]
assert pusher_counts["pushed"] == 7
assert pusher_counts["pushed"] == worker.counts["total"]
def test_pdfextract_blob_worker():
sink = BlackholeSink()
worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
worker.process(pdf_bytes)
|