diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-08-06 10:12:25 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-08-06 10:12:25 -0700 |
commit | 5aa5683cabcf773d8eabc962afc79b1f4cc511fb (patch) | |
tree | c87f11307c013f8d4e9cdd6a228538b0e8f66eed | |
parent | af140307a025738767e740fea8da8d15e20fb983 (diff) | |
download | sandcrawler-5aa5683cabcf773d8eabc962afc79b1f4cc511fb.tar.gz sandcrawler-5aa5683cabcf773d8eabc962afc79b1f4cc511fb.zip |
more pdfextract skip sha1hex
-rw-r--r-- | python/sandcrawler/pdfextract.py | 21 |
1 files changed, 12 insertions, 9 deletions
```diff
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index be799bb..a6be786 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -18,21 +18,24 @@ from .ia import WaybackClient, WaybackError, PetaboxError
 # poppler. For some reason, the usual Kafka timeout catcher isn't working on
 # these, maybe due to threading.
 BAD_PDF_SHA1HEX = [
+    "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
+    "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
+    "0641822e68c5a07538b967489fd19a1d5dc371a5",
+    "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
+    "09db7c9f2efb496c974427a61e84292ae27fc702",
+    "10c6577a658bf6203557e2998b25ea9788f8adfe",
+    "182749ad1db1d5e999d07f010bdcfc2978dadc88",
+    "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
+    "25ab9e6169f041be05844a9b4edd6574918af769",
     "373f84dfab4ed47047826e604e2918a9cd6a95b2",
+    "445968ef735b228c08c3ff4238d99fc9f4824619",
+    "447fa6b5a90742a86429a932f6608d8e141688c0",
+    "4c81129904f7976a50825595a3497ea7b52579ef",
     "64d821d728f9a3dc944b4c03be00feea0b57e314",
     "88edcbab1cac2d70af5870422974afc253f4f0c6",
     "8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
     "b2b66b9c7f817a20144456f99c0be805602e8597",
     "d6b0f405bf13c23d0e90c54eea527442786d1cd3",
-    "445968ef735b228c08c3ff4238d99fc9f4824619",
-    "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
-    "447fa6b5a90742a86429a932f6608d8e141688c0",
-    "0641822e68c5a07538b967489fd19a1d5dc371a5",
-    "09db7c9f2efb496c974427a61e84292ae27fc702",
-    "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
-    "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
-    "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
-    "182749ad1db1d5e999d07f010bdcfc2978dadc88",
 ]
 
 @dataclass
```