From 5aa5683cabcf773d8eabc962afc79b1f4cc511fb Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 6 Aug 2020 10:12:25 -0700
Subject: more pdfextract skip sha1hex

---
 python/sandcrawler/pdfextract.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index be799bb..a6be786 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -18,21 +18,24 @@ from .ia import WaybackClient, WaybackError, PetaboxError
 # poppler. For some reason, the usual Kafka timeout catcher isn't working on
 # these, maybe due to threading.
 BAD_PDF_SHA1HEX = [
+    "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
+    "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
+    "0641822e68c5a07538b967489fd19a1d5dc371a5",
+    "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
+    "09db7c9f2efb496c974427a61e84292ae27fc702",
+    "10c6577a658bf6203557e2998b25ea9788f8adfe",
+    "182749ad1db1d5e999d07f010bdcfc2978dadc88",
+    "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
+    "25ab9e6169f041be05844a9b4edd6574918af769",
     "373f84dfab4ed47047826e604e2918a9cd6a95b2",
+    "445968ef735b228c08c3ff4238d99fc9f4824619",
+    "447fa6b5a90742a86429a932f6608d8e141688c0",
+    "4c81129904f7976a50825595a3497ea7b52579ef",
     "64d821d728f9a3dc944b4c03be00feea0b57e314",
     "88edcbab1cac2d70af5870422974afc253f4f0c6",
     "8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
     "b2b66b9c7f817a20144456f99c0be805602e8597",
     "d6b0f405bf13c23d0e90c54eea527442786d1cd3",
-    "445968ef735b228c08c3ff4238d99fc9f4824619",
-    "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
-    "447fa6b5a90742a86429a932f6608d8e141688c0",
-    "0641822e68c5a07538b967489fd19a1d5dc371a5",
-    "09db7c9f2efb496c974427a61e84292ae27fc702",
-    "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
-    "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
-    "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
-    "182749ad1db1d5e999d07f010bdcfc2978dadc88",
 ]
 
 @dataclass
-- 
cgit v1.2.3