From 2c19b7180e83c70411516c63b8dced5429b450f4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 14 Sep 2022 12:00:59 -0700 Subject: catch poppler 'ValueError' when parsing PDFs Seeing a spike in bad PDFs in the past week or so, while processing old failed ingests. Should really switch from poppler to MuPDF. --- python/sandcrawler/pdfextract.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'python') diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 2441891..5f2dc1a 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -340,7 +340,8 @@ def process_pdf( ) # this call sometimes fails an returns an AttributeError page0rect = page0.page_rect() - except (AttributeError, poppler.document.LockedDocumentError) as e: + # NOTE: poppler sometimes throws a 'ValueError', but this is pretty broad to catch + except (AttributeError, poppler.document.LockedDocumentError, ValueError) as e: # may need to expand the set of exceptions caught here over time, but # starting with a narrow set return PdfExtractResult( -- cgit v1.2.3