aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/pdfextract.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/pdfextract.py')
-rw-r--r--python/sandcrawler/pdfextract.py20
1 files changed, 19 insertions, 1 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 6c18395..97d338e 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -69,35 +69,45 @@ BAD_PDF_SHA1HEX: List[str] = [
"58d9ae7dcb0a7dbbdfc58ad266030b037e9cd0ff",
"59cfc843ebdb1c1e5db1efc76a40f46cb3bb06f0",
"5ab98405b676ee81a6ca74fba51a9e4a6cff7311",
+ "5c5b45c85eff07d4302844e00ec8baa57b988c60",
"5e04779cbbae5ce88bb786064f756885dd6895fe",
"5e6a3adde9f08c276c4efd72bfacb256f2ec35d9",
+ "62247fe6b8d3ca50477cafddbe24bf63832d6674",
"623ff84b616383d0a3e0dd8dbce12f0b5fe9a6ac",
"646c4a654270606256397684204ff0f3d17be2e7",
"64d821d728f9a3dc944b4c03be00feea0b57e314",
+ "668b7d777203af4b261d21bf4669fc9b385062e1",
"689b5cb3ddef213d612363a903f10d0358ea64d2",
"6909f0b62d8b7835de3dec7777aad7f8ef507ee3",
"74e617dc95555e8ca3aadd19d0c85b71cd77d1d9",
+ "7596438d77444a7c4228bb96fa4b394ba7d7e23b",
"75c2662a96ccc48891228df7c85eb7d4da9dd621",
"771f1ca0007a6fbed5b4a434c73f524f715d33c1",
"776859635e9dc01d97b0582f49c814ffbcb019fb",
"781dafda896a9f5c30f3d0a011f79a3b79b574c4",
"788672c7c2bcdecf6e2f6a2177c01e60f04d9cfb",
"79d6cba3c6e577a0f3a3a9fe575680d38454938d",
+ "7b8b7e8e4b789579a7d2fda329db52528383a652",
+ "7c5c925cfb7c5a861b5c0a1d923308f9bedd335e",
"7cfc0739be9c49d94272110a0a748256bdde9be6",
"7daf61526ec825151f384cc1db510ca5237d5d80",
"7e9d846f3bf9ce15cdb991b78cc870ab8a2bed76",
+ "800e47a7ed214f7acac85cc29aa7b0f9c0e218ae",
"8398b211a5ec4da1195a4ba1bc29ca8c0ac40f67",
"859d7ec532a0bf3b52b17c7f2d8ecc58410c0aad",
"88edcbab1cac2d70af5870422974afc253f4f0c6",
"89860fc475fcb2a2d86c4544df52ec8fd5e6533f",
"8dcaf4ef132900dd378f7be526c884b17452713b",
"8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
+ "8ec1a17ec19ae8ade95b9bdc837236981e83fffb",
"949dfb7d833da9576b2ccb9eb1ab5457469c53d3",
"961ec451172f373f919c593737466300e42062cb",
"976989fa6e447578d9ce16ec5b526f0e09d6df50",
+ "977f23723027d7052df9b49eb467e6c0b9af93ff",
"98b02eb70066c182c705ef4d14d8b723ad7f1fab",
"993ca31f6974f8387bb18dd7d38987d290da8781",
"9dbd05af3442e6f42d67868054751b76973f4171",
+ "a1cc781c694a48e018f4de110b58f561aa212051",
"a2298c137b9c8c8975bad62eea9224edb95e6952",
"a2671738755ab8b24775e95375dc72f1ca4e5fd6",
"a26f299fb97c646effeebd4c5e2968786bd0f781",
@@ -106,6 +116,7 @@ BAD_PDF_SHA1HEX: List[str] = [
"a69665d0b5d3b95f54f68406eee3ed50c67efb45",
"a8357c31837404f9ebd798999d546c9398ab3648",
"a9162b9aef5e5da0897275fede1a6cff8cc93dfc",
+ "abc9d264df446707b40d7c9f79befd0f89291e59",
"ad038725bf6855a79f3c768ebe93c7103d14522f",
"aef581bf42e76e527f5aed3b8958fd4e7a24819f",
"b2b66b9c7f817a20144456f99c0be805602e8597",
@@ -116,9 +127,11 @@ BAD_PDF_SHA1HEX: List[str] = [
"b8b427e5b3d650ba9e03197f9c3917e25b878930",
"bad48b89b639b5b7df2c6a2d5288181fcb8b0e35",
"be0cda7642e9247b3ee41cd2017fa709aab4f344",
+ "beff1b0c24aa99989be73c66dfb1d1e7578e370b",
"c1b583fbd052572f08158d39ffe4d7510dadbebb",
"c2526f75a013dc67b14ce1e2d0e4fc80bb93c6e1",
"c4abbb284f4acaca9e8ceb88f842901984e84d33",
+ "c58e028269c8dfd3a442f6745c81b4c0e8610c43",
"c7220d1bf1e71fb755d9f26bbdd4c539dc162960",
"c7687fa6f637c7d32a25be0e772867d87536d35c",
"c7d8b37ec99cf0d987e60667f05299f200e18a5d",
@@ -131,7 +144,9 @@ BAD_PDF_SHA1HEX: List[str] = [
"d188762a7e3ab5d4ee8a897204316513e4e636ec",
"d613b9e4442f5d5d19ea6814fa9729bff7da7c85",
"d6b0f405bf13c23d0e90c54eea527442786d1cd3",
+ "d91d3830bf455e6dd782eee46218e35d29f07dfd",
"da2211ee2dbc6dda36571976d810e2366a3d2504",
+ "dbb3093a797e0ae83d39eb7b235ff85a17fd965c",
"e01bb7256d77aea258313bb410dfcfc10512f420",
"e2bf5d0a5885359381fe8ef2cd9290171d494e9b",
"e2c3b8a2cf33d5e8972bc9ddb78373766a75e412",
@@ -142,6 +157,7 @@ BAD_PDF_SHA1HEX: List[str] = [
"eaf84b2efd2f69c7b3f407f89ea66ac4c41fac36",
"eb1b39fd7a874896688855a22efddef10272427c",
"eb5fffaa590a52bcc3705b888c6ff9c4dc4c45b2",
+ "ecc4b927c5e84e145c610876931bc261ae13769b",
"edf8dcc8736f06afbaca0e01d60bd2c475403a3d",
"ee2ee6ae2cf05128810d0d95bbe69bd263e140de",
"ee9530a2c5a3d1e3813ccb51a55cc8b0d9b5dfc7",
@@ -150,6 +166,7 @@ BAD_PDF_SHA1HEX: List[str] = [
"f0ea221d8587cede25592266486e119d277f7096",
"f68f9a9202a75d2aee35252e104d796f9515001e",
"f9314d3bf2eac78a7d78d18adcccdb35542054ef",
+ "f932ef936021a3b00842b481478c40868b9a007c",
"fd9bd560662e070b222d63052830837829c490f0",
]
@@ -324,7 +341,8 @@ def process_pdf(
)
# this call sometimes fails and raises an AttributeError
page0rect = page0.page_rect()
- except (AttributeError, poppler.document.LockedDocumentError) as e:
+ # NOTE: poppler sometimes throws a 'ValueError', but this is pretty broad to catch
+ except (AttributeError, poppler.document.LockedDocumentError, ValueError) as e:
# may need to expand the set of exceptions caught here over time, but
# starting with a narrow set
return PdfExtractResult(