hack to unblock thumbnail processing pipeline

Some PDFs are taking 10+ minutes to process, causing Kafka exceptions and consumer churn. Not sure why the Kafka JSON pusher timeouts are not catching these.
author: Bryan Newbold <bnewbold@archive.org> 2020-06-29 15:21:34 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-29 15:21:37 -0700
commit: 2f4b35f29f53b0e643c3e7cd74e63370758dc490 (patch)
tree: 685962293debf2d91326cbef281f1d3cb717ef4e
parent: 800860ecd25346ff4a638e9d42fa905396b8fa1b (diff)
download: sandcrawler-2f4b35f29f53b0e643c3e7cd74e63370758dc490.tar.gz
sandcrawler-2f4b35f29f53b0e643c3e7cd74e63370758dc490.zip
1 files changed, 16 insertions, 0 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index ac5f6ac..c77a3f0 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -14,6 +14,14 @@ from .misc import gen_file_metadata
 from .ia import WaybackClient, WaybackError, PetaboxError
 
 
+# This is a hack to work around timeouts when processing certain PDFs with
+# poppler. For some reason, the usual Kafka timeout catcher isn't working on
+# these, maybe due to threading.
+BAD_PDF_SHA1HEX = [
+    "373f84dfab4ed47047826e604e2918a9cd6a95b2",
+    "88edcbab1cac2d70af5870422974afc253f4f0c6",
+]
+
 @dataclass
 class PdfExtractResult:
     sha1hex: str
@@ -149,6 +157,14 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
             file_meta=file_meta,
         )
 
+    if sha1hex in BAD_PDF_SHA1HEX:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='bad-pdf',
+            error_msg=f"PDF known to cause processing issues",
+            file_meta=file_meta,
+        )
+
     try:
         pdf = poppler.load_from_data(blob)
         if pdf is None:
author	Bryan Newbold <bnewbold@archive.org>	2020-06-29 15:21:34 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-06-29 15:21:37 -0700
commit	2f4b35f29f53b0e643c3e7cd74e63370758dc490 (patch)
tree	685962293debf2d91326cbef281f1d3cb717ef4e
parent	800860ecd25346ff4a638e9d42fa905396b8fa1b (diff)
download	sandcrawler-2f4b35f29f53b0e643c3e7cd74e63370758dc490.tar.gz sandcrawler-2f4b35f29f53b0e643c3e7cd74e63370758dc490.zip