about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-10 12:19:37 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-10 12:19:37 -0700
commit 41fdef2a3768bec1cdbde21ab72fcbbe44c180c7 (patch)
tree 68afe816cc031a2d650aba6ff233d93a686d9cc6
parent 9753876b85c767a9848467065b4d4dd613d5ed68 (diff)
download sandcrawler-41fdef2a3768bec1cdbde21ab72fcbbe44c180c7.tar.gz
sandcrawler-41fdef2a3768bec1cdbde21ab72fcbbe44c180c7.zip
yet more bad sha1 PDFs to skip
-rw-r--r-- python/sandcrawler/pdfextract.py 20
1 files changed, 20 insertions, 0 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 5d5f6f1..58199c8 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -26,25 +26,36 @@ BAD_PDF_SHA1HEX = [
"09db7c9f2efb496c974427a61e84292ae27fc702",
"0d1c1567ea70e7b922ba88ccb868ffc7ca18e75c",
"10c6577a658bf6203557e2998b25ea9788f8adfe",
+ "17e679b0ec9444fff2ea4d02caec05dd2de80ec3",
"182749ad1db1d5e999d07f010bdcfc2978dadc88",
"20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
"25ab9e6169f041be05844a9b4edd6574918af769",
"281de904c4642a9be4f17b9774fc0a2bdc8a90e3",
"2fc64da736175810918fd32c94c5068b0d660bcc",
"32318fba9b05b2756b7362bcaa4722c92ed8d449",
+ "336833c6fc968cd0938250dfc93c032a30111cfc",
"373f84dfab4ed47047826e604e2918a9cd6a95b2",
"3ac0b6e17e30d141871a0a5b127536919fe5aa19",
+ "3c8a6a708da0dc1802f5f3e5267a49b3c25e1ffe",
+ "3e5f9fb94e7314447a22f3d009419a922136177f",
"436c9183724f051b22c96285aa8ff1d2ba709574",
"445968ef735b228c08c3ff4238d99fc9f4824619",
"447fa6b5a90742a86429a932f6608d8e141688c0",
+ "4785181cec8944eee00ddb631a5dfc771b89bab7",
+ "47db2db2cc976429568841a0496c0ab4ed7b5977",
"4c81129904f7976a50825595a3497ea7b52579ef",
"50b3c5a3122272aca69855ef06b85d0b43a76eb1",
"58d9ae7dcb0a7dbbdfc58ad266030b037e9cd0ff",
+ "5ab98405b676ee81a6ca74fba51a9e4a6cff7311",
"5e6a3adde9f08c276c4efd72bfacb256f2ec35d9",
+ "623ff84b616383d0a3e0dd8dbce12f0b5fe9a6ac",
"646c4a654270606256397684204ff0f3d17be2e7",
"64d821d728f9a3dc944b4c03be00feea0b57e314",
"6909f0b62d8b7835de3dec7777aad7f8ef507ee3",
"771f1ca0007a6fbed5b4a434c73f524f715d33c1",
+ "781dafda896a9f5c30f3d0a011f79a3b79b574c4",
+ "788672c7c2bcdecf6e2f6a2177c01e60f04d9cfb",
+ "7e9d846f3bf9ce15cdb991b78cc870ab8a2bed76",
"88edcbab1cac2d70af5870422974afc253f4f0c6",
"8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
"949dfb7d833da9576b2ccb9eb1ab5457469c53d3",
@@ -55,15 +66,24 @@ BAD_PDF_SHA1HEX = [
"b2d719120306b90eb8dd3580b699a61ec70556f4",
"b5be7f409a3a2601208c5ce08cf52b9ac1094aae",
"b5bf8b7467fb095c90adf3b49aa1687291e4469c",
+ "b8b427e5b3d650ba9e03197f9c3917e25b878930",
+ "bad48b89b639b5b7df2c6a2d5288181fcb8b0e35",
"c1b583fbd052572f08158d39ffe4d7510dadbebb",
+ "c7220d1bf1e71fb755d9f26bbdd4c539dc162960",
+ "c7687fa6f637c7d32a25be0e772867d87536d35c",
+ "c92b9ae9eefa07504950b405625aef54b48f0e1a",
"ccb1debcfae006a3fc984e9e91309b9706a5c375",
"cd8a7c3b8d850ebedc1ca791ccb37b9a2689f9c3",
"d17b1e254cce82df5c6eb4fd492cef91e7e11558",
"d188762a7e3ab5d4ee8a897204316513e4e636ec",
"d6b0f405bf13c23d0e90c54eea527442786d1cd3",
+ "da2211ee2dbc6dda36571976d810e2366a3d2504",
"e01bb7256d77aea258313bb410dfcfc10512f420",
+ "e2bf5d0a5885359381fe8ef2cd9290171d494e9b",
+ "e9d7716b4f94bbc3d94459b5fe9bb8b15cb2e433",
"eb1b39fd7a874896688855a22efddef10272427c",
"eb5fffaa590a52bcc3705b888c6ff9c4dc4c45b2",
+ "ee9530a2c5a3d1e3813ccb51a55cc8b0d9b5dfc7",
"f68f9a9202a75d2aee35252e104d796f9515001e",
]