diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 23:05:06 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 23:05:06 -0700 |
commit | a4de486bdd28b71562ae65d7ce6c8c741e5056b3 (patch) | |
tree | bb82d693cd249ebbce20db8a667c10e551ad79b1 /python | |
parent | 615914b13e5a67bcf749cdc682c86c83c1e7bacc (diff) | |
download | sandcrawler-a4de486bdd28b71562ae65d7ce6c8c741e5056b3.tar.gz sandcrawler-a4de486bdd28b71562ae65d7ce6c8c741e5056b3.zip |
catch UnicodeDecodeError in pdfextract
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/pdfextract.py | 11 |
1 file changed, 10 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index d1b060e..98d6861 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -215,7 +215,16 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
             file_meta=file_meta,
         )
 
-    pdf_info = pdf.infos()
+    try:
+        pdf_info = pdf.infos()
+    except UnicodeDecodeError:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='bad-unicode',
+            error_msg="in infos()",
+            file_meta=file_meta,
+        )
+
     # TODO: is this actually needed? or does json marshalling work automatically?
     for k in pdf_info.keys():
         if isinstance(pdf_info[k], datetime.datetime):
```