1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
import pytest
import struct
import responses
import poppler
from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
from sandcrawler.pdfextract import process_pdf
from test_wayback import wayback_client, cdx_client
# Junk payload that begins with a "%PDF" marker, followed by an 8-byte
# network-order (big-endian) signed integer. Used as input that process_pdf
# is expected to reject.
FAKE_PDF_BYTES = b"".join((b"%PDF SOME JUNK", struct.pack("!q", 112853843)))
def test_process_fake_pdf():
    """Check that process_pdf reports status 'not-pdf' for non-PDF input.

    Two inputs are exercised: the FAKE_PDF_BYTES junk payload, and the raw
    bytes of the tests/files/dummy_zip.zip fixture.
    """
    # Case 1: junk bytes that only carry a "%PDF" prefix.
    junk_result = process_pdf(FAKE_PDF_BYTES)
    print(junk_result)
    assert junk_result.status == "not-pdf"

    # Case 2: bytes read from the zip fixture file.
    with open('tests/files/dummy_zip.zip', 'rb') as zip_file:
        zip_bytes = zip_file.read()
    zip_result = process_pdf(zip_bytes)
    assert zip_result.status == 'not-pdf'
@pytest.mark.skipif(poppler.version_string() == '0.71.0', reason="unsupported version of poppler")
def test_process_dummy_pdf():
    """Run process_pdf on the dummy.pdf fixture and verify the extracted fields.

    Skipped when the installed poppler reports version string '0.71.0'.
    """
    with open('tests/files/dummy.pdf', 'rb') as pdf_file:
        raw_pdf = pdf_file.read()
    result = process_pdf(raw_pdf)

    # Overall outcome and the primary extraction outputs.
    assert result.status == 'success'
    assert result.page0_thumbnail is not None
    assert len(result.text) > 10
    assert result.meta_xml is None
    assert result.file_meta['mimetype'] == 'application/pdf'

    # Print the metadata dicts so they show up in pytest output on failure.
    info = result.pdf_info
    print(info)
    extra = result.pdf_extra
    print(extra)
    assert info['Author'] == "Evangelos Vlachogiannis"

    # First page is expected to measure 595 (width) x 842 (height).
    assert extra['page0_height'] == 842
    assert extra['page0_width'] == 595
    assert extra['page_count'] == 1
def test_pdfextract_worker_cdx(wayback_client):
    """Push the example.cdx fixture through a PdfExtractWorker and check counts.

    The pusher is restricted to HTTP statuses 200/226 and the
    'application/pdf' mimetype; exactly 7 lines are expected to be pushed,
    and the worker's own total must match that number.
    """
    blackhole = BlackholeSink()
    extract_worker = PdfExtractWorker(wayback_client, sink=blackhole, thumbnail_sink=blackhole)

    with open('tests/files/example.cdx', 'r') as cdx_lines:
        # Build the pusher and run it while the CDX file is still open.
        counts = CdxLinePusher(
            extract_worker,
            cdx_lines,
            filter_http_statuses=[200, 226],
            filter_mimetypes=['application/pdf'],
        ).run()
        assert counts['total']
        assert counts['pushed'] == 7
        assert counts['pushed'] == extract_worker.counts['total']
def test_pdfextract_blob_worker():
    """Smoke-test PdfExtractBlobWorker.process with the bytes of dummy.pdf.

    Nothing about the result is asserted; the test passes as long as
    processing completes without raising.
    """
    blackhole = BlackholeSink()
    blob_worker = PdfExtractBlobWorker(sink=blackhole, thumbnail_sink=blackhole)

    with open('tests/files/dummy.pdf', 'rb') as pdf_file:
        raw_pdf = pdf_file.read()

    blob_worker.process(raw_pdf)
|