aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-08-06 10:12:25 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-08-06 10:12:25 -0700
commit 5aa5683cabcf773d8eabc962afc79b1f4cc511fb (patch)
tree c87f11307c013f8d4e9cdd6a228538b0e8f66eed /python
parent af140307a025738767e740fea8da8d15e20fb983 (diff)
download sandcrawler-5aa5683cabcf773d8eabc962afc79b1f4cc511fb.tar.gz
sandcrawler-5aa5683cabcf773d8eabc962afc79b1f4cc511fb.zip
more pdfextract skip sha1hex
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/pdfextract.py 21
1 files changed, 12 insertions, 9 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index be799bb..a6be786 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -18,21 +18,24 @@ from .ia import WaybackClient, WaybackError, PetaboxError
# poppler. For some reason, the usual Kafka timeout catcher isn't working on
# these, maybe due to threading.
BAD_PDF_SHA1HEX = [
+ "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
+ "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
+ "0641822e68c5a07538b967489fd19a1d5dc371a5",
+ "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
+ "09db7c9f2efb496c974427a61e84292ae27fc702",
+ "10c6577a658bf6203557e2998b25ea9788f8adfe",
+ "182749ad1db1d5e999d07f010bdcfc2978dadc88",
+ "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
+ "25ab9e6169f041be05844a9b4edd6574918af769",
"373f84dfab4ed47047826e604e2918a9cd6a95b2",
+ "445968ef735b228c08c3ff4238d99fc9f4824619",
+ "447fa6b5a90742a86429a932f6608d8e141688c0",
+ "4c81129904f7976a50825595a3497ea7b52579ef",
"64d821d728f9a3dc944b4c03be00feea0b57e314",
"88edcbab1cac2d70af5870422974afc253f4f0c6",
"8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
"b2b66b9c7f817a20144456f99c0be805602e8597",
"d6b0f405bf13c23d0e90c54eea527442786d1cd3",
- "445968ef735b228c08c3ff4238d99fc9f4824619",
- "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
- "447fa6b5a90742a86429a932f6608d8e141688c0",
- "0641822e68c5a07538b967489fd19a1d5dc371a5",
- "09db7c9f2efb496c974427a61e84292ae27fc702",
- "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
- "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
- "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
- "182749ad1db1d5e999d07f010bdcfc2978dadc88",
]

@dataclass