From fa25698620f54207423c3da40ae1bca567b598fa Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Thu, 25 Jun 2020 23:06:36 -0700
Subject: pdfextract: handle AttributeError during text extraction

---
 python/sandcrawler/pdfextract.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 98d6861..921c883 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -194,10 +194,18 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
         print(str(e), file=sys.stderr)
         page0_thumbnail = None
 
-    full_text = page0.text()
-    for n in range(1, pdf.pages):
-        pageN = pdf.create_page(n)
-        full_text += pageN.text()
+    try:
+        full_text = page0.text()
+        for n in range(1, pdf.pages):
+            pageN = pdf.create_page(n)
+            full_text += pageN.text()
+    except AttributeError as e:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='parse-error',
+            error_msg=str(e),
+            file_meta=file_meta,
+        )
 
     # Kafka message size limit; cap at about 1 MByte
     if len(full_text)> 1000000:
-- 
cgit v1.2.3