diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 23:09:45 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 23:09:45 -0700 |
commit | 19524304c40d8bbddc6252f33a592b5713d7799f (patch) | |
tree | e18d90d9561f0ee590e23867be738f7e3e7495bf | |
parent | 254f24ad6566c9d4b5814868911b604802847b58 (diff) | |
download | sandcrawler-19524304c40d8bbddc6252f33a592b5713d7799f.tar.gz sandcrawler-19524304c40d8bbddc6252f33a592b5713d7799f.zip |
Revert "simpler handling of null PDF text pages"
This reverts commit 254f24ad6566c9d4b5814868911b604802847b58.
The AttributeError was actually raised inside the text() call itself, not caused by a None page.
-rw-r--r-- | python/sandcrawler/pdfextract.py | 15 |
1 file changed, 11 insertions, 4 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index ed323b9..921c883 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -194,11 +194,18 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr print(str(e), file=sys.stderr) page0_thumbnail = None - full_text = page0.text() - for n in range(1, pdf.pages): - pageN = pdf.create_page(n) - if pageN: + try: + full_text = page0.text() + for n in range(1, pdf.pages): + pageN = pdf.create_page(n) full_text += pageN.text() + except AttributeError as e: + return PdfExtractResult( + sha1hex=sha1hex, + status='parse-error', + error_msg=str(e), + file_meta=file_meta, + ) # Kafka message size limit; cap at about 1 MByte if len(full_text)> 1000000: |