about | summary | refs | log | tree | commit | diff | stats
path: root/python/tests/test_pdf.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-17 18:03:01 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-17 18:03:01 -0700
commit e03f4eda875f4a36a3d3e62eac467b3f2b1e0fbe (patch)
tree 7c97d66ae764456819809a0ecf3c06b2156e1e2c /python/tests/test_pdf.py
parent 1acacd8f7e8200fa9da6d9dd4754f5ac31c9b36f (diff)
download sandcrawler-e03f4eda875f4a36a3d3e62eac467b3f2b1e0fbe.tar.gz
sandcrawler-e03f4eda875f4a36a3d3e62eac467b3f2b1e0fbe.zip
rename pdf tools to pdfextract
Diffstat (limited to 'python/tests/test_pdf.py')
-rw-r--r-- python/tests/test_pdf.py 61
1 files changed, 0 insertions, 61 deletions
diff --git a/python/tests/test_pdf.py b/python/tests/test_pdf.py
deleted file mode 100644
index 1ccf85c..0000000
--- a/python/tests/test_pdf.py
+++ /dev/null
@@ -1,61 +0,0 @@
-
-import pytest
-import struct
-import responses
-
-from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
-from sandcrawler.pdf import process_pdf
-from test_wayback import wayback_client, cdx_client
-
-
-FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
-
-def test_process_fake_pdf():
- resp = process_pdf(FAKE_PDF_BYTES)
- print(resp)
- assert resp.status == "not-pdf"
-
-def test_process_dummy_pdf():
- with open('tests/files/dummy.pdf', 'rb') as f:
- pdf_bytes = f.read()
- resp = process_pdf(pdf_bytes)
- assert resp.status == 'success'
- assert resp.page0_thumbnail is not None
- assert len(resp.text) > 10
- assert resp.meta_xml is None
- assert resp.file_meta['mimetype'] == 'application/pdf'
- print(resp.pdf_info)
- print(resp.pdf_extra)
- assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis"
- # 595 x 842
- assert resp.pdf_extra['height'] == 842
- assert resp.pdf_extra['width'] == 595
- assert resp.pdf_extra['page_count'] == 1
-
-def test_pdfextract_worker_cdx(wayback_client):
-
- sink = BlackholeSink()
- worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
-
- with open('tests/files/example.cdx', 'r') as cdx_file:
- pusher = CdxLinePusher(
- worker,
- cdx_file,
- filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
- )
- pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
-
-def test_pdfextract_blob_worker():
-
- sink = BlackholeSink()
- worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
-
- with open('tests/files/dummy.pdf', 'rb') as f:
- pdf_bytes = f.read()
-
- worker.process(pdf_bytes)
-