From e63f9df7d2c707bc1b2e0cfd109f9b2aaf29aa09 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 16 Sep 2020 18:32:13 -0700 Subject: yet another broken PDF (sha1) --- python/sandcrawler/pdfextract.py | 1 + 1 file changed, 1 insertion(+) (limited to 'python') diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 06868a7..d417e1b 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -18,6 +18,7 @@ from .ia import WaybackClient, WaybackError, PetaboxError # poppler. For some reason, the usual Kafka timeout catcher isn't working on # these, maybe due to threading. BAD_PDF_SHA1HEX = [ + "011478a1e63a2a31eae1a93832a74cc95f220760", "018dfe9824de6d2ac068ce0f7dc9961bffa1b558", "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53", "0641822e68c5a07538b967489fd19a1d5dc371a5", -- cgit v1.2.3