From fa25698620f54207423c3da40ae1bca567b598fa Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 25 Jun 2020 23:06:36 -0700
Subject: pdfextract: attributerror with text extraction

---
 python/sandcrawler/pdfextract.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 98d6861..921c883 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -194,10 +194,18 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
         print(str(e), file=sys.stderr)
         page0_thumbnail = None
 
-    full_text = page0.text()
-    for n in range(1, pdf.pages):
-        pageN = pdf.create_page(n)
-        full_text += pageN.text()
+    try:
+        full_text = page0.text()
+        for n in range(1, pdf.pages):
+            pageN = pdf.create_page(n)
+            full_text += pageN.text()
+    except AttributeError as e:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='parse-error',
+            error_msg=str(e),
+            file_meta=file_meta,
+        )
 
     # Kafka message size limit; cap at about 1 MByte
     if len(full_text)> 1000000:
-- 
cgit v1.2.3