From b4a40d99b23a83eabeed490c0dce52dba31dc7b8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 17 Jun 2020 21:23:08 -0700 Subject: fixes and tweaks from testing locally --- python/sandcrawler/pdfextract.py | 67 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 3 deletions(-) (limited to 'python/sandcrawler/pdfextract.py') diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 4e55f3f..5ef5dfd 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -1,5 +1,6 @@ import sys +import json import datetime from io import BytesIO from dataclasses import dataclass @@ -21,6 +22,7 @@ class PdfExtractResult: file_meta: Optional[Dict[str,Any]] = None text: Optional[str] = None page0_thumbnail: Optional[bytes] = None + has_page0_thumbnail: bool = False meta_xml: Optional[str] = None pdf_info: Optional[Dict[str,Any]] = None pdf_extra: Optional[Dict[str,Any]] = None @@ -31,18 +33,75 @@ class PdfExtractResult: Outputs a JSON string as would be published to Kafka text/info topic. """ return { + 'key': self.sha1hex, 'sha1hex': self.sha1hex, 'status': self.status, 'file_meta': self.file_meta, 'error_msg': self.error_msg, 'text': self.text, - 'page0_thumbnail': self.page0_thumbnail is not None, + 'has_page0_thumbnail': self.has_page0_thumbnail, 'meta_xml': self.meta_xml, 'pdf_info': self.pdf_info, 'pdf_extra': self.pdf_extra, 'source': self.source, } + @classmethod + def from_pdftext_dict(cls, record): + """ + Builds a PdfExtractResult from a dict shaped like the output of to_pdftext_dict(). 
+ """ + if record['status'] != 'success': + return PdfExtractResult( + sha1hex=record['sha1hex'], + status=record['status'], + error_msg=record.get('error_msg'), + ) + else: + return PdfExtractResult( + sha1hex=record['sha1hex'], + status=record['status'], + file_meta=record.get('file_meta'), + text=record.get('text'), + has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)), + meta_xml=record.get('meta_xml'), + pdf_info=record.get('pdf_info'), + pdf_extra=record.get('pdf_extra'), + ) + + def to_sql_tuple(self) -> tuple: + # pdf_meta (sha1hex, updated, status, page0_thumbnail, page_count, + # word_count, page0_height, page0_width, permanent_id, pdf_created, + # pdf_version, metadata) + word_count: Optional[int] = None + if self.text: + word_count = len(self.text.split()) + metadata: Optional[Dict] = None + pdf_extra = self.pdf_extra or dict() + pdf_created = None + # TODO: form, encrypted + if self.pdf_info: + metadata = dict() + for k in ('Title', 'Subject', 'Author', 'Creator', 'Producer', 'doi'): + if k in self.pdf_info: + metadata[k.lower()] = self.pdf_info[k] + if 'CreationDate' in self.pdf_info: + pdf_created = self.pdf_info['CreationDate'] + return ( + self.sha1hex, + datetime.datetime.now(), # updated + self.status, + self.has_page0_thumbnail, + pdf_extra.get('page_count'), + word_count, + pdf_extra.get('page0_height'), + pdf_extra.get('page0_width'), + pdf_extra.get('permanent_id'), + pdf_created, + pdf_extra.get('pdf_version'), + metadata and json.dumps(metadata, sort_keys=True), + ) + def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult: """ @@ -70,6 +129,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr sha1hex=sha1hex, status='empty-pdf', file_meta=file_meta, + has_page0_thumbnail=False, ) page0 = pdf.create_page(0) if page0 is None: @@ -131,6 +191,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr status='success', error_msg=None, 
text=full_text or None, + has_page0_thumbnail=page0_thumbnail is not None, page0_thumbnail=page0_thumbnail, meta_xml=pdf.metadata or None, pdf_info=pdf_info, @@ -172,7 +233,7 @@ class PdfExtractWorker(SandcrawlerFetchWorker): result = process_pdf(blob) result.source = record if self.thumbnail_sink and result.page0_thumbnail is not None: - self.thumbnail_sink.push_record(result.page0_thumbnail) + self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex) return result.to_pdftext_dict() class PdfExtractBlobWorker(SandcrawlerWorker): @@ -193,7 +254,7 @@ class PdfExtractBlobWorker(SandcrawlerWorker): result = process_pdf(blob) if self.thumbnail_sink and result.page0_thumbnail is not None: - self.thumbnail_sink.push_record(result.page0_thumbnail) + self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex) return result -- cgit v1.2.3