aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-09-14 12:00:59 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-09-14 12:01:01 -0700
commit: 2c19b7180e83c70411516c63b8dced5429b450f4 (patch)
tree: 8f7e4c9237f9bd9c84af9d226aa0a48eb49b7c3d /python
parent: a283b054dc98620046dff28cbb16663564b8320b (diff)
download: sandcrawler-2c19b7180e83c70411516c63b8dced5429b450f4.tar.gz
sandcrawler-2c19b7180e83c70411516c63b8dced5429b450f4.zip
catch poppler 'ValueError' when parsing PDFs
Seeing a spike in bad PDFs in the past week or so, while processing old failed ingests. Should really switch from poppler to muPDF.
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/pdfextract.py 3
1 files changed, 2 insertions, 1 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 2441891..5f2dc1a 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -340,7 +340,8 @@ def process_pdf(
)
# this call sometimes fails and raises an AttributeError
page0rect = page0.page_rect()
- except (AttributeError, poppler.document.LockedDocumentError) as e:
+ # NOTE: poppler sometimes throws a 'ValueError', but this is pretty broad to catch
+ except (AttributeError, poppler.document.LockedDocumentError, ValueError) as e:
# may need to expand the set of exceptions caught here over time, but
# starting with a narrow set
return PdfExtractResult(