diff options
-rw-r--r-- | python/sandcrawler/pdfextract.py | 15 |
1 files changed, 4 insertions, 11 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 921c883..ed323b9 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -194,18 +194,11 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr print(str(e), file=sys.stderr) page0_thumbnail = None - try: - full_text = page0.text() - for n in range(1, pdf.pages): - pageN = pdf.create_page(n) + full_text = page0.text() + for n in range(1, pdf.pages): + pageN = pdf.create_page(n) + if pageN: full_text += pageN.text() - except AttributeError as e: - return PdfExtractResult( - sha1hex=sha1hex, - status='parse-error', - error_msg=str(e), - file_meta=file_meta, - ) # Kafka message size limit; cap at about 1 MByte if len(full_text)> 1000000: |