about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-25 23:05:06 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-25 23:05:06 -0700
commit a4de486bdd28b71562ae65d7ce6c8c741e5056b3 (patch)
tree bb82d693cd249ebbce20db8a667c10e551ad79b1 /python
parent 615914b13e5a67bcf749cdc682c86c83c1e7bacc (diff)
download sandcrawler-a4de486bdd28b71562ae65d7ce6c8c741e5056b3.tar.gz
sandcrawler-a4de486bdd28b71562ae65d7ce6c8c741e5056b3.zip
catch UnicodeDecodeError in pdfextract
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/pdfextract.py 11
1 file changed, 10 insertions, 1 deletion
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index d1b060e..98d6861 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -215,7 +215,16 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
file_meta=file_meta,
)
- pdf_info = pdf.infos()
+ try:
+ pdf_info = pdf.infos()
+ except UnicodeDecodeError:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status='bad-unicode',
+ error_msg="in infos()",
+ file_meta=file_meta,
+ )
+
# TODO: is this actually needed? or does json marshalling work automatically?
for k in pdf_info.keys():
if isinstance(pdf_info[k], datetime.datetime):