aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-09-16 18:32:13 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-09-16 18:32:13 -0700
commit e63f9df7d2c707bc1b2e0cfd109f9b2aaf29aa09 (patch)
tree e910eac494f12cf9df60997650815c154b386026
parent d86a87f5000b97a2dc93c4a60ba4a18e834c9e0f (diff)
download sandcrawler-e63f9df7d2c707bc1b2e0cfd109f9b2aaf29aa09.tar.gz
sandcrawler-e63f9df7d2c707bc1b2e0cfd109f9b2aaf29aa09.zip
yet another broken PDF (sha1)
-rw-r--r-- python/sandcrawler/pdfextract.py 1
1 files changed, 1 insertions, 0 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 06868a7..d417e1b 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -18,6 +18,7 @@ from .ia import WaybackClient, WaybackError, PetaboxError
# poppler. For some reason, the usual Kafka timeout catcher isn't working on
# these, maybe due to threading.
BAD_PDF_SHA1HEX = [
+ "011478a1e63a2a31eae1a93832a74cc95f220760",
"018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
"057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
"0641822e68c5a07538b967489fd19a1d5dc371a5",