From e03f4eda875f4a36a3d3e62eac467b3f2b1e0fbe Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 17 Jun 2020 18:03:01 -0700 Subject: rename pdf tools to pdfextract --- python/pdf_tool.py | 137 -------------------------------- python/pdfextract_tool.py | 137 ++++++++++++++++++++++++++++++++ python/sandcrawler/pdf.py | 167 --------------------------------------- python/sandcrawler/pdfextract.py | 167 +++++++++++++++++++++++++++++++++++++++ python/tests/test_pdf.py | 61 -------------- python/tests/test_pdfextract.py | 61 ++++++++++++++ 6 files changed, 365 insertions(+), 365 deletions(-) delete mode 100755 python/pdf_tool.py create mode 100755 python/pdfextract_tool.py delete mode 100644 python/sandcrawler/pdf.py create mode 100644 python/sandcrawler/pdfextract.py delete mode 100644 python/tests/test_pdf.py create mode 100644 python/tests/test_pdfextract.py diff --git a/python/pdf_tool.py b/python/pdf_tool.py deleted file mode 100755 index ed8c2be..0000000 --- a/python/pdf_tool.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python3 - -""" -These are generally for running one-off tasks from the command line. Output -might go to stdout, or might go to Kafka topic. - -Example of large parallel run, locally: - - cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json - -""" - -import sys -import json -import argparse -import datetime - -from grobid2json import teixml2json -from sandcrawler import * - - -def run_extract_json(args): - wayback_client = WaybackClient() - if args.jobs > 1: - worker = PdfExtractWorker(wayback_client, sink=None, thumbnail_sink=args.thumbnail_sink) - multi_worker = MultiprocessWrapper(worker, args.sink) - pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs) - else: - worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink) - pusher = JsonLinePusher(worker, args.json_file) - pusher.run() - -def run_extract_cdx(args): - wayback_client = WaybackClient() - if args.jobs > 1: - worker = PdfExtractWorker(wayback_client, sink=None) - multi_worker = MultiprocessWrapper(worker, args.sink) - pusher = CdxLinePusher( - multi_worker, - args.cdx_file, - filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], - batch_size=args.jobs, - ) - else: - worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink) - pusher = CdxLinePusher( - worker, - args.cdx_file, - filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], - ) - pusher.run() - -def run_extract_zipfile(args): - if args.jobs > 1: - print("multi-processing: {}".format(args.jobs), file=sys.stderr) - worker = PdfExtractBlobWorker(sink=None, thumbnail_sink=None) - multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs) - pusher = ZipfilePusher(multi_worker, args.zip_file, batch_size=args.jobs) - else: - worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=None) - pusher = ZipfilePusher(worker, args.zip_file) - pusher.run() - -def run_single(args): - worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=None) - with open(args.pdf_file, 'rb') as pdf_file: - result = worker.process(pdf_file.open()) - print(json.dumps(result, sort_keys=True)) - - -def main(): - parser = argparse.ArgumentParser( - 
formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--kafka-mode', - action='store_true', - help="send output to Kafka (not stdout)") - parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('-j', '--jobs', - default=8, type=int, - help="parallelism for batch CPU jobs") - subparsers = parser.add_subparsers() - - sub_extract_json = subparsers.add_parser('extract-json', - help="for each JSON line with CDX info, fetches PDF and does PDF extraction") - sub_extract_json.set_defaults(func=run_extract_json) - sub_extract_json.add_argument('json_file', - help="JSON file to import from (or '-' for stdin)", - type=argparse.FileType('r')) - - sub_extract_cdx = subparsers.add_parser('extract-cdx', - help="for each CDX line, fetches PDF and does PDF extraction") - sub_extract_cdx.set_defaults(func=run_extract_cdx) - sub_extract_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) - - sub_extract_zipfile = subparsers.add_parser('extract-zipfile', - help="opens zipfile, iterates over PDF files inside and does PDF extract for each") - sub_extract_zipfile.set_defaults(func=run_extract_zipfile) - sub_extract_zipfile.add_argument('zip_file', - help="zipfile with PDFs to extract", - type=str) - - sub_single = subparsers.add_parser('single', - help="opens single PDF and extracts it") - sub_single.set_defaults(func=run_single) - sub_single.add_argument('pdf_file', - help="single PDF to extract", - type=str) - - args = parser.parse_args() - if not args.__dict__.get("func"): - print("tell me what to do!", file=sys.stderr) - sys.exit(-1) - - args.text_sink = None - args.thumbnail_sink = None - if args.kafka_mode: - text_topic = "sandcrawler-{}.pdftext".format(args.kafka_env) - thumbnail_topic = "sandcrawler-{}.thumbnail-180px-jpeg".format(args.kafka_env) - args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, - produce_topic=text_topic) - args.thumbnail_sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, - produce_topic=thumbnail_topic) - print("Running in kafka output mode, publishing to {} and {}\n".format( - text_topic, thumbnail_topic), file=sys.stderr) - - args.func(args) - -if __name__ == '__main__': - main() diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py new file mode 100755 index 0000000..ed8c2be --- /dev/null +++ b/python/pdfextract_tool.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 + +""" +These are generally for running one-off tasks from the command line. Output +might go to stdout, or might go to Kafka topic. 
+
+Example of large parallel run, locally:
+
+    cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./pdfextract_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode -j0 extract-json -
+"""
+
+import sys
+import json
+import argparse
+import datetime
+
+from grobid2json import teixml2json
+from sandcrawler import *
+
+
+def run_extract_json(args):
+    wayback_client = WaybackClient()
+    if args.jobs > 1:
+        worker = PdfExtractWorker(wayback_client, sink=None, thumbnail_sink=args.thumbnail_sink)
+        multi_worker = MultiprocessWrapper(worker, args.sink)
+        pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+    else:
+        worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+        pusher = JsonLinePusher(worker, args.json_file)
+    pusher.run()
+
+def run_extract_cdx(args):
+    wayback_client = WaybackClient()
+    if args.jobs > 1:
+        worker = PdfExtractWorker(wayback_client, sink=None, thumbnail_sink=args.thumbnail_sink)
+        multi_worker = MultiprocessWrapper(worker, args.sink)
+        pusher = CdxLinePusher(
+            multi_worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+            batch_size=args.jobs,
+        )
+    else:
+        worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+        pusher = CdxLinePusher(
+            worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+        )
+    pusher.run()
+
+def run_extract_zipfile(args):
+    if args.jobs > 1:
+        print("multi-processing: {}".format(args.jobs), file=sys.stderr)
+        worker = PdfExtractBlobWorker(sink=None, thumbnail_sink=None)
+        multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+        pusher = ZipfilePusher(multi_worker, args.zip_file, batch_size=args.jobs)
+    else:
+        worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=None)
+        pusher = ZipfilePusher(worker, args.zip_file)
+    pusher.run()
+
+def run_single(args):
+    worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=None)
+    with open(args.pdf_file, 'rb') as pdf_file:
+        result = worker.process(pdf_file.read())
+    print(json.dumps(result.to_pdftext_dict(), sort_keys=True))
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--kafka-mode',
+        action='store_true',
+        help="send output to Kafka (not stdout)")
+    parser.add_argument('--kafka-hosts',
+        default="localhost:9092",
+        help="list of Kafka brokers (host/port) to use")
+    parser.add_argument('--kafka-env',
+        default="dev",
+        help="Kafka topic namespace to use (eg, prod, qa, dev)")
+    parser.add_argument('-j', '--jobs',
+        default=8, type=int,
+        help="parallelism for batch CPU jobs")
+    subparsers = parser.add_subparsers()
+
+    sub_extract_json = subparsers.add_parser('extract-json',
+        help="for each JSON line with CDX info, fetches PDF and does PDF extraction")
+    sub_extract_json.set_defaults(func=run_extract_json)
+    sub_extract_json.add_argument('json_file',
+        help="JSON file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
+    sub_extract_cdx = subparsers.add_parser('extract-cdx',
+        help="for each CDX line, fetches PDF and does PDF extraction")
+    sub_extract_cdx.set_defaults(func=run_extract_cdx)
+    sub_extract_cdx.add_argument('cdx_file',
+        help="CDX file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
+    sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
+        help="opens zipfile, iterates over PDF files inside and does PDF extract for each")
+    sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
+    sub_extract_zipfile.add_argument('zip_file',
+        help="zipfile with PDFs to extract",
+        type=str)
+
+    sub_single = subparsers.add_parser('single',
+        help="opens single PDF and extracts it")
+    sub_single.set_defaults(func=run_single)
+    sub_single.add_argument('pdf_file',
+        help="single PDF to extract",
+        type=str)
+
+    args = parser.parse_args()
+    if not args.__dict__.get("func"):
+        print("tell me what to do!", file=sys.stderr)
+        sys.exit(-1)
+
+    args.sink = None
+    args.thumbnail_sink = None
+    if args.kafka_mode:
+        text_topic = "sandcrawler-{}.pdftext".format(args.kafka_env)
+        thumbnail_topic = "sandcrawler-{}.thumbnail-180px-jpeg".format(args.kafka_env)
+        args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
+            produce_topic=text_topic)
+        args.thumbnail_sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
+            produce_topic=thumbnail_topic)
+        print("Running in kafka output mode, publishing to {} and {}\n".format(
+            text_topic, thumbnail_topic), file=sys.stderr)
+
+    args.func(args)
+
+if __name__ == '__main__':
+    main()
diff --git a/python/sandcrawler/pdf.py b/python/sandcrawler/pdf.py
deleted file mode 100644
index cfba679..0000000
--- a/python/sandcrawler/pdf.py
+++ /dev/null
@@ -1,167 +0,0 @@
-
-import sys
-import datetime
-from io import BytesIO
-from dataclasses import dataclass
-from typing import Optional, Dict, Any
-
-import poppler
-from PIL import Image
-
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
-from .misc import gen_file_metadata
-from .ia import WaybackClient, WaybackError, PetaboxError
-
-
-@dataclass
-class PdfExtractResult:
-    sha1hex: str
-    status: str
-    error_msg: Optional[str] = None
-    file_meta: Optional[Dict[str,Any]] = None
-    text: Optional[str] = None
-    page0_thumbnail: Optional[bytes] = None
-    meta_xml: Optional[str] = None
-    pdf_info: Optional[Dict[str,Any]] = None
-    pdf_extra: Optional[Dict[str,Any]] = None
-    source: Optional[Dict[str,Any]] = None
-
-    def to_pdftext_dict(self) -> dict:
-        """
-        Outputs a JSON string as would be published to Kafka text/info topic.
- """ - return { - 'sha1hex': self.sha1hex, - 'status': self.status, - 'file_meta': self.file_meta, - 'error_msg': self.error_msg, - 'text': self.text, - 'page0_thumbnail': self.page0_thumbnail is not None, - 'meta_xml': self.meta_xml, - 'pdf_info': self.pdf_info, - 'pdf_extra': self.pdf_extra, - 'source': self.source, - } - - -def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult: - file_meta = gen_file_metadata(blob) - sha1hex = file_meta['sha1hex'] - if file_meta['mimetype'] != 'application/pdf': - return PdfExtractResult( - sha1hex=sha1hex, - status='not-pdf', - error_msg=f"mimetype is '{file_meta['mimetype']}'", - file_meta=file_meta, - ) - - try: - pdf = poppler.load_from_data(blob) - page0 = pdf.create_page(0) - except NotImplementedError as e: - return PdfExtractResult( - sha1hex=sha1hex, - status='parse-error', - error_msg=str(e), - file_meta=file_meta, - ) - - page0_thumbnail: Optional[bytes] = None - renderer = poppler.PageRenderer() - try: - full_img = renderer.render_page(page0) - img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', "RGBA", 0, 1) - img.thumbnail(thumb_size, Image.BICUBIC) - buf = BytesIO() - img.save(buf, thumb_type) - page0_thumbnail = buf.getvalue() - # assuming that very small images mean something went wrong - if page0_thumbnail is None or len(page0_thumbnail) < 50: - page0_thumbnail = None - except Exception as e: - print(str(e), file=sys.stderr) - page0_thumbnail = None - - page0rect = page0.page_rect() - full_text = page0.text() - for n in range(1, pdf.pages): - pageN = pdf.create_page(n) - full_text += pageN.text() - pdf_info = pdf.infos() - # TODO: is this actually needed? or does json marshalling work automatically? - for k in pdf_info.keys(): - if isinstance(pdf_info[k], datetime.datetime): - pdf_info[k] = datetime.datetime.isoformat(pdf_info[k]) - - return PdfExtractResult( - sha1hex=sha1hex, - file_meta=file_meta, - status='success', - error_msg=None, - text=full_text or None, - page0_thumbnail=page0_thumbnail, - meta_xml=pdf.metadata or None, - pdf_info=pdf.infos(), - pdf_extra=dict( - height=page0rect.height, - width=page0rect.width, - page_count=pdf.pages, - permanent_id=pdf.pdf_id.permanent_id, - update_id=pdf.pdf_id.update_id, - pdf_version=f"{pdf.pdf_version[0]}.{pdf.pdf_version[1]}", - ), - ) - -class PdfExtractWorker(SandcrawlerFetchWorker): - - def __init__(self, wayback_client=None, sink=None, **kwargs): - super().__init__(wayback_client=wayback_client) - self.wayback_client = wayback_client - self.sink = sink - self.thumbnail_sink = kwargs.get('thumbnail_sink') - - def timeout_response(self, task) -> Dict: - default_key = task['sha1hex'] - return dict( - status="error-timeout", - error_msg="internal GROBID worker timeout", - source=task, - sha1hex=default_key, - ) - - def process(self, record, key: Optional[str] = None): - default_key = record['sha1hex'] - - fetch_result = self.fetch_blob(record) - if fetch_result['status'] != 'success': - return fetch_result - blob = fetch_result['blob'] - - result = process_pdf(blob) - result.source = record - if self.thumbnail_sink and result.page0_thumbnail is not None: - self.thumbnail_sink.push_record(result.page0_thumbnail) - return result.to_pdftext_dict() - -class PdfExtractBlobWorker(SandcrawlerWorker): - """ - This is sort of like PdfExtractWorker, except it receives blobs directly, - instead of fetching blobs from some remote store. 
- """ - - def __init__(self, sink=None, **kwargs): - super().__init__() - self.sink = sink - self.thumbnail_sink = kwargs.get('thumbnail_sink') - - def process(self, blob, key: Optional[str] = None): - if not blob: - return None - assert isinstance(blob, bytes) - - result = process_pdf(blob) - if self.thumbnail_sink and result.page0_thumbnail is not None: - self.thumbnail_sink.push_record(result.page0_thumbnail) - - return result - diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py new file mode 100644 index 0000000..cfba679 --- /dev/null +++ b/python/sandcrawler/pdfextract.py @@ -0,0 +1,167 @@ + +import sys +import datetime +from io import BytesIO +from dataclasses import dataclass +from typing import Optional, Dict, Any + +import poppler +from PIL import Image + +from .workers import SandcrawlerWorker, SandcrawlerFetchWorker +from .misc import gen_file_metadata +from .ia import WaybackClient, WaybackError, PetaboxError + + +@dataclass +class PdfExtractResult: + sha1hex: str + status: str + error_msg: Optional[str] = None + file_meta: Optional[Dict[str,Any]] = None + text: Optional[str] = None + page0_thumbnail: Optional[bytes] = None + meta_xml: Optional[str] = None + pdf_info: Optional[Dict[str,Any]] = None + pdf_extra: Optional[Dict[str,Any]] = None + source: Optional[Dict[str,Any]] = None + + def to_pdftext_dict(self) -> dict: + """ + Outputs a JSON string as would be published to Kafka text/info topic. + """ + return { + 'sha1hex': self.sha1hex, + 'status': self.status, + 'file_meta': self.file_meta, + 'error_msg': self.error_msg, + 'text': self.text, + 'page0_thumbnail': self.page0_thumbnail is not None, + 'meta_xml': self.meta_xml, + 'pdf_info': self.pdf_info, + 'pdf_extra': self.pdf_extra, + 'source': self.source, + } + + +def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult: + file_meta = gen_file_metadata(blob) + sha1hex = file_meta['sha1hex'] + if file_meta['mimetype'] != 'application/pdf': + return PdfExtractResult( + sha1hex=sha1hex, + status='not-pdf', + error_msg=f"mimetype is '{file_meta['mimetype']}'", + file_meta=file_meta, + ) + + try: + pdf = poppler.load_from_data(blob) + page0 = pdf.create_page(0) + except NotImplementedError as e: + return PdfExtractResult( + sha1hex=sha1hex, + status='parse-error', + error_msg=str(e), + file_meta=file_meta, + ) + + page0_thumbnail: Optional[bytes] = None + renderer = poppler.PageRenderer() + try: + full_img = renderer.render_page(page0) + img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', "RGBA", 0, 1) + img.thumbnail(thumb_size, Image.BICUBIC) + buf = BytesIO() + img.save(buf, thumb_type) + page0_thumbnail = buf.getvalue() + # assuming that very small images mean something went wrong + if page0_thumbnail is None or len(page0_thumbnail) < 50: + page0_thumbnail = None + except Exception as e: + print(str(e), file=sys.stderr) + page0_thumbnail = None + + page0rect = page0.page_rect() + full_text = page0.text() + for n in range(1, pdf.pages): + pageN = pdf.create_page(n) + full_text += pageN.text() + pdf_info = pdf.infos() + # TODO: is this actually needed? or does json marshalling work automatically? 
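+    # (it is needed: the stdlib json encoder raises TypeError on
+    # datetime.datetime values, which poppler info dicts can contain,
+    # eg CreationDate and ModDate, so convert them to ISO 8601 strings)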
+    for k in pdf_info.keys():
+        if isinstance(pdf_info[k], datetime.datetime):
+            pdf_info[k] = datetime.datetime.isoformat(pdf_info[k])
+
+    return PdfExtractResult(
+        sha1hex=sha1hex,
+        file_meta=file_meta,
+        status='success',
+        error_msg=None,
+        text=full_text or None,
+        page0_thumbnail=page0_thumbnail,
+        meta_xml=pdf.metadata or None,
+        pdf_info=pdf_info,
+        pdf_extra=dict(
+            height=page0rect.height,
+            width=page0rect.width,
+            page_count=pdf.pages,
+            permanent_id=pdf.pdf_id.permanent_id,
+            update_id=pdf.pdf_id.update_id,
+            pdf_version=f"{pdf.pdf_version[0]}.{pdf.pdf_version[1]}",
+        ),
+    )
+
+class PdfExtractWorker(SandcrawlerFetchWorker):
+
+    def __init__(self, wayback_client=None, sink=None, **kwargs):
+        super().__init__(wayback_client=wayback_client)
+        self.wayback_client = wayback_client
+        self.sink = sink
+        self.thumbnail_sink = kwargs.get('thumbnail_sink')
+
+    def timeout_response(self, task) -> Dict:
+        default_key = task['sha1hex']
+        return dict(
+            status="error-timeout",
+            error_msg="internal pdf-extract worker timeout",
+            source=task,
+            sha1hex=default_key,
+        )
+
+    def process(self, record, key: Optional[str] = None):
+        default_key = record['sha1hex']
+
+        fetch_result = self.fetch_blob(record)
+        if fetch_result['status'] != 'success':
+            return fetch_result
+        blob = fetch_result['blob']
+
+        result = process_pdf(blob)
+        result.source = record
+        if self.thumbnail_sink and result.page0_thumbnail is not None:
+            self.thumbnail_sink.push_record(result.page0_thumbnail)
+        return result.to_pdftext_dict()
+
+class PdfExtractBlobWorker(SandcrawlerWorker):
+    """
+    This is sort of like PdfExtractWorker, except it receives blobs directly,
+    instead of fetching blobs from some remote store.
+    """
+
+    def __init__(self, sink=None, **kwargs):
+        super().__init__()
+        self.sink = sink
+        self.thumbnail_sink = kwargs.get('thumbnail_sink')
+
+    def process(self, blob, key: Optional[str] = None):
+        if not blob:
+            return None
+        assert isinstance(blob, bytes)
+
+        result = process_pdf(blob)
+        if self.thumbnail_sink and result.page0_thumbnail is not None:
+            self.thumbnail_sink.push_record(result.page0_thumbnail)
+
+        return result
+
diff --git a/python/tests/test_pdf.py b/python/tests/test_pdf.py
deleted file mode 100644
index 1ccf85c..0000000
--- a/python/tests/test_pdf.py
+++ /dev/null
@@ -1,61 +0,0 @@
-
-import pytest
-import struct
-import responses
-
-from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
-from sandcrawler.pdf import process_pdf
-from test_wayback import wayback_client, cdx_client
-
-
-FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
-
-def test_process_fake_pdf():
-    resp = process_pdf(FAKE_PDF_BYTES)
-    print(resp)
-    assert resp.status == "not-pdf"
-
-def test_process_dummy_pdf():
-    with open('tests/files/dummy.pdf', 'rb') as f:
-        pdf_bytes = f.read()
-    resp = process_pdf(pdf_bytes)
-    assert resp.status == 'success'
-    assert resp.page0_thumbnail is not None
-    assert len(resp.text) > 10
-    assert resp.meta_xml is None
-    assert resp.file_meta['mimetype'] == 'application/pdf'
-    print(resp.pdf_info)
-    print(resp.pdf_extra)
-    assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis"
-    # 595 x 842
-    assert resp.pdf_extra['height'] == 842
-    assert resp.pdf_extra['width'] == 595
-    assert resp.pdf_extra['page_count'] == 1
-
-def test_pdfextract_worker_cdx(wayback_client):
-
-    sink = BlackholeSink()
-    worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
-
-    with open('tests/files/example.cdx', 'r') as cdx_file:
-        pusher = CdxLinePusher(
-            worker,
-            cdx_file,
-            filter_http_statuses=[200, 226],
-            filter_mimetypes=['application/pdf'],
-        )
-        pusher_counts = pusher.run()
-    assert pusher_counts['total']
-    assert pusher_counts['pushed'] == 7
-    assert pusher_counts['pushed'] == worker.counts['total']
-
-def test_pdfextract_blob_worker():
-
-    sink = BlackholeSink()
-    worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
-
-    with open('tests/files/dummy.pdf', 'rb') as f:
-        pdf_bytes = f.read()
-
-    worker.process(pdf_bytes)
-
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
new file mode 100644
index 0000000..1ccf85c
--- /dev/null
+++ b/python/tests/test_pdfextract.py
@@ -0,0 +1,61 @@
+
+import pytest
+import struct
+import responses
+
+from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from sandcrawler.pdfextract import process_pdf
+from test_wayback import wayback_client, cdx_client
+
+
+FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+
+def test_process_fake_pdf():
+    resp = process_pdf(FAKE_PDF_BYTES)
+    print(resp)
+    assert resp.status == "not-pdf"
+
+def test_process_dummy_pdf():
+    with open('tests/files/dummy.pdf', 'rb') as f:
+        pdf_bytes = f.read()
+    resp = process_pdf(pdf_bytes)
+    assert resp.status == 'success'
+    assert resp.page0_thumbnail is not None
+    assert len(resp.text) > 10
+    assert resp.meta_xml is None
+    assert resp.file_meta['mimetype'] == 'application/pdf'
+    print(resp.pdf_info)
+    print(resp.pdf_extra)
+    assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis"
+    # 595 x 842 (A4 page size, in points)
+    assert resp.pdf_extra['height'] == 842
+    assert resp.pdf_extra['width'] == 595
+    assert resp.pdf_extra['page_count'] == 1
+
+def test_pdfextract_worker_cdx(wayback_client):
+
+    sink = BlackholeSink()
+    worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
+
+    with open('tests/files/example.cdx', 'r') as cdx_file:
+        pusher = CdxLinePusher(
+            worker,
+            cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+        )
+        pusher_counts = pusher.run()
+    assert pusher_counts['total']
+    assert pusher_counts['pushed'] == 7
+    assert pusher_counts['pushed'] == worker.counts['total']
+
+def test_pdfextract_blob_worker():
+
+    sink = BlackholeSink()
+    worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
+
+    with open('tests/files/dummy.pdf', 'rb') as f:
+        pdf_bytes = f.read()
+
+    worker.process(pdf_bytes)
+
-- 
cgit v1.2.3
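
For reference, a minimal sketch of calling the renamed module directly, in the
spirit of the 'single' subcommand above. The import path and the
PdfExtractResult fields used here (status, page0_thumbnail, to_pdftext_dict)
come from the patch itself; the thumbnail output path is an arbitrary choice
for illustration:

    #!/usr/bin/env python3

    import json
    import sys

    from sandcrawler.pdfextract import process_pdf

    def extract_one(path: str) -> None:
        with open(path, 'rb') as f:
            blob = f.read()
        # returns a PdfExtractResult; status is one of 'success',
        # 'not-pdf', or 'parse-error'
        result = process_pdf(blob)
        # same dict shape that gets published to the Kafka pdftext topic
        print(json.dumps(result.to_pdftext_dict(), sort_keys=True))
        # page0_thumbnail holds raw JPEG bytes for the first page (or None)
        if result.page0_thumbnail is not None:
            with open(path + '.page0.jpg', 'wb') as out:
                out.write(result.page0_thumbnail)

    if __name__ == '__main__':
        extract_one(sys.argv[1])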