about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-25 23:06:36 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-25 23:06:36 -0700
commit: fa25698620f54207423c3da40ae1bca567b598fa (patch)
tree: e18d90d9561f0ee590e23867be738f7e3e7495bf
parent: a4de486bdd28b71562ae65d7ce6c8c741e5056b3 (diff)
download: sandcrawler-fa25698620f54207423c3da40ae1bca567b598fa.tar.gz
download: sandcrawler-fa25698620f54207423c3da40ae1bca567b598fa.zip
pdfextract: attributerror with text extraction
-rw-r--r-- python/sandcrawler/pdfextract.py | 16
1 files changed, 12 insertions, 4 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 98d6861..921c883 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -194,10 +194,18 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
print(str(e), file=sys.stderr)
page0_thumbnail = None
- full_text = page0.text()
- for n in range(1, pdf.pages):
- pageN = pdf.create_page(n)
- full_text += pageN.text()
+ try:
+ full_text = page0.text()
+ for n in range(1, pdf.pages):
+ pageN = pdf.create_page(n)
+ full_text += pageN.text()
+ except AttributeError as e:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status='parse-error',
+ error_msg=str(e),
+ file_meta=file_meta,
+ )
# Kafka message size limit; cap at about 1 MByte
if len(full_text)> 1000000: