diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 15:21:34 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-29 15:21:37 -0700 |
commit | 2f4b35f29f53b0e643c3e7cd74e63370758dc490 (patch) | |
tree | 685962293debf2d91326cbef281f1d3cb717ef4e | |
parent | 800860ecd25346ff4a638e9d42fa905396b8fa1b (diff) | |
download | sandcrawler-2f4b35f29f53b0e643c3e7cd74e63370758dc490.tar.gz sandcrawler-2f4b35f29f53b0e643c3e7cd74e63370758dc490.zip |
hack to unblock thumbnail processing pipeline
Some PDFs are taking 10+ minutes to process, causing Kafka exceptions and
consumer churn. It is not clear why the Kafka JSON pusher timeouts are not
catching these.
-rw-r--r-- | python/sandcrawler/pdfextract.py | 16 |
1 file changed, 16 insertions, 0 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index ac5f6ac..c77a3f0 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -14,6 +14,14 @@ from .misc import gen_file_metadata from .ia import WaybackClient, WaybackError, PetaboxError +# This is a hack to work around timeouts when processing certain PDFs with +# poppler. For some reason, the usual Kafka timeout catcher isn't working on +# these, maybe due to threading. +BAD_PDF_SHA1HEX = [ + "373f84dfab4ed47047826e604e2918a9cd6a95b2", + "88edcbab1cac2d70af5870422974afc253f4f0c6", +] + @dataclass class PdfExtractResult: sha1hex: str @@ -149,6 +157,14 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr file_meta=file_meta, ) + if sha1hex in BAD_PDF_SHA1HEX: + return PdfExtractResult( + sha1hex=sha1hex, + status='bad-pdf', + error_msg=f"PDF known to cause processing issues", + file_meta=file_meta, + ) + try: pdf = poppler.load_from_data(blob) if pdf is None: |