diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-16 18:32:13 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-16 18:32:13 -0700 |
commit | e63f9df7d2c707bc1b2e0cfd109f9b2aaf29aa09 (patch) | |
tree | e910eac494f12cf9df60997650815c154b386026 | |
parent | d86a87f5000b97a2dc93c4a60ba4a18e834c9e0f (diff) | |
download | sandcrawler-e63f9df7d2c707bc1b2e0cfd109f9b2aaf29aa09.tar.gz sandcrawler-e63f9df7d2c707bc1b2e0cfd109f9b2aaf29aa09.zip |
yet another broken PDF (sha1)
-rw-r--r-- | python/sandcrawler/pdfextract.py | 1 |
1 file changed, 1 insertion, 0 deletions
```diff
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 06868a7..d417e1b 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -18,6 +18,7 @@ from .ia import WaybackClient, WaybackError, PetaboxError
 # poppler. For some reason, the usual Kafka timeout catcher isn't working on
 # these, maybe due to threading.
 BAD_PDF_SHA1HEX = [
+    "011478a1e63a2a31eae1a93832a74cc95f220760",
     "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
     "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
     "0641822e68c5a07538b967489fd19a1d5dc371a5",
```