diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-16 17:28:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-16 17:28:36 -0700 |
commit | 5c32007e23a4f3b6902b760b5e06e4dd341918b3 (patch) | |
tree | 86fe446ef6f980d09fa95867ddb0bae847cc2765 /python | |
parent | d49ea4fb3f567351c63816e703348d8a9fd49ff0 (diff) | |
download | sandcrawler-5c32007e23a4f3b6902b760b5e06e4dd341918b3.tar.gz sandcrawler-5c32007e23a4f3b6902b760b5e06e4dd341918b3.zip |
initial work on PDF extraction worker
This worker fetches full PDFs, then extracts thumbnails, raw text, and
PDF metadata. Similar to GROBID worker.
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/__init__.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/pdf.py | 157 |
2 files changed, 158 insertions, 1 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 654df35..2e5efd7 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -7,4 +7,4 @@ from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePage from .ingest import IngestFileWorker from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient - +from .pdf import PdfExtractWorker diff --git a/python/sandcrawler/pdf.py b/python/sandcrawler/pdf.py new file mode 100644 index 0000000..fb72dfe --- /dev/null +++ b/python/sandcrawler/pdf.py @@ -0,0 +1,157 @@ + +import sys +from io import BytesIO +from dataclasses import dataclass +from typing import Optional, Dict, Any + +import poppler +from PIL import Image + +from .workers import SandcrawlerWorker, SandcrawlerFetchWorker +from .misc import gen_file_metadata +from .ia import WaybackClient, WaybackError, PetaboxError + + +@dataclass +class PdfExtractResult: + sha1hex: str + status: str + error_msg: Optional[str] + file_meta: Optional[Dict[str,Any]] + text: Optional[str] + page0_thumbnail: Optional[bytes] + meta_xml: Optional[str] + pdf_info: Optional[Dict[str,Any]] + pdf_extra: Optional[Dict[str,Any]] + + def to_text_dict(self) -> dict: + """ + Outputs a JSON string as would be published to Kafka text/info topic. + """ + return { + 'sha1hex': self.sha1hex, + 'status': self.status, + 'file_meta': self.file_meta, + 'error_msg': self.error_msg, + 'text': self.text, + 'page0_thumbnail': self.page0_thumbnail is not None, + 'meta_xml': self.meta_xml, + 'pdf_info': self.pdf_info, + 'pdf_extra': self.pdf_extra, + } + + +def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult: + file_meta = gen_file_metadata(blob) + sha1hex = file_meta['sha1hex'] + if file_meta['mimetype'] != 'application/pdf': + return PdfExtractResult( + sha1hex=sha1hex, + status='not-pdf', + error_msg=f"mimetype is '{file_meta['mimetype']}'", + file_meta=file_meta, + ) + + try: + pdf = poppler.load_from_data(blob) + page0 = pdf.create_page(0) + except NotImplementedError as e: + return PdfExtractResult( + sha1hex=sha1hex, + status='parse-error', + error_msg=str(e), + file_meta=file_meta, + ) + + page0_thumbnail: Optional[bytes] = None + renderer = poppler.PageRenderer() + try: + full_img = renderer.render_page(page0) + img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', "RGBA", 0, 1) + img.thumbnail(thumb_size, Image.BICUBIC) + buf = BytesIO() + img.save(buf, thumb_type) + page0_thumbnail = buf.bytes.getvalue() + # assuming that very small images mean something went wrong + if len(page0_thumbnail) < 50: + page0_thumbnail = None + except Exception as e: + print(str(e), file=sys.stderr) + page0_thumbnail = None + + page0rect = page0.page_rect() + full_text = page0.text() + for n in range(1, pdf.pages): + pageN = pdf.create_page(n) + full_text += pageN.text() + pdf_info = pdf.infos() + # Is this actually needed? or does json marshalling work automatically? + #for k in pdf_info.keys(): + # if isinstance(pdf_info[k], datetime.datetime): + # pdf_info[k] = datetime.datetime.isoformat(pdf_info[k]) + + return PdfExtractResult( + sha1hex=sha1hex, + file_meta=file_meta, + status='success', + error_msg=None, + text=full_text, + page0_thumbnail=page0_thumbnail, + meta_xml=pdf.metadata, + pdf_info=pdf.infos(), + pdf_extra=dict( + height=page0rect.height, + width=page0rect.width, + permanent_id=pdf.pdf_id.permanent_id, + update_id=pdf.pdf_id.update_id, + pdf_version=f"{pdf.pdf_version[0]}.{pdf.pdf_version[1]}", + ), + ) + +class PdfExtractWorker(SandcrawlerFetchWorker): + + def __init__(self, wayback_client=None, sink=None, **kwargs): + super().__init__(wayback_client=wayback_client) + self.wayback_client = wayback_client + self.sink = sink + self.thumbnail_sink = kwargs.get('thumbnail_sink') + + def timeout_response(self, task): + default_key = task['sha1hex'] + return dict( + status="error-timeout", + error_msg="internal GROBID worker timeout", + source=task, + sha1hex=default_key, + ) + + def process(self, record) -> dict: + default_key = record['sha1hex'] + + fetch_result = self.fetch_blob(record) + if fetch_result['status'] != 'success': + return fetch_result + blob = fetch_result['blob'] + + result = process_pdf(blob) + result.source = record + if self.thumbnail_sink and result.page0_thumbnail is not None: + self.thumbnail_sink.push_record(result.page0_thumbnail) + return result.to_thing() + +class PdfExtractBlobWorker(SandcrawlerWorker): + """ + This is sort of like PdfExtractWorker, except it receives blobs directly, + instead of fetching blobs from some remote store. + """ + + def __init__(self, sink=None, **kwargs): + super().__init__() + self.sink = sink + + def process(self, blob): + if not blob: + return None + result = process_pdf(blob) + return result + |