diff options
Diffstat (limited to 'python/sandcrawler/pdfextract.py')
-rw-r--r-- | python/sandcrawler/pdfextract.py | 20 |
1 files changed, 19 insertions, 1 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 6c18395..97d338e 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -69,35 +69,45 @@ BAD_PDF_SHA1HEX: List[str] = [ "58d9ae7dcb0a7dbbdfc58ad266030b037e9cd0ff", "59cfc843ebdb1c1e5db1efc76a40f46cb3bb06f0", "5ab98405b676ee81a6ca74fba51a9e4a6cff7311", + "5c5b45c85eff07d4302844e00ec8baa57b988c60", "5e04779cbbae5ce88bb786064f756885dd6895fe", "5e6a3adde9f08c276c4efd72bfacb256f2ec35d9", + "62247fe6b8d3ca50477cafddbe24bf63832d6674", "623ff84b616383d0a3e0dd8dbce12f0b5fe9a6ac", "646c4a654270606256397684204ff0f3d17be2e7", "64d821d728f9a3dc944b4c03be00feea0b57e314", + "668b7d777203af4b261d21bf4669fc9b385062e1", "689b5cb3ddef213d612363a903f10d0358ea64d2", "6909f0b62d8b7835de3dec7777aad7f8ef507ee3", "74e617dc95555e8ca3aadd19d0c85b71cd77d1d9", + "7596438d77444a7c4228bb96fa4b394ba7d7e23b", "75c2662a96ccc48891228df7c85eb7d4da9dd621", "771f1ca0007a6fbed5b4a434c73f524f715d33c1", "776859635e9dc01d97b0582f49c814ffbcb019fb", "781dafda896a9f5c30f3d0a011f79a3b79b574c4", "788672c7c2bcdecf6e2f6a2177c01e60f04d9cfb", "79d6cba3c6e577a0f3a3a9fe575680d38454938d", + "7b8b7e8e4b789579a7d2fda329db52528383a652", + "7c5c925cfb7c5a861b5c0a1d923308f9bedd335e", "7cfc0739be9c49d94272110a0a748256bdde9be6", "7daf61526ec825151f384cc1db510ca5237d5d80", "7e9d846f3bf9ce15cdb991b78cc870ab8a2bed76", + "800e47a7ed214f7acac85cc29aa7b0f9c0e218ae", "8398b211a5ec4da1195a4ba1bc29ca8c0ac40f67", "859d7ec532a0bf3b52b17c7f2d8ecc58410c0aad", "88edcbab1cac2d70af5870422974afc253f4f0c6", "89860fc475fcb2a2d86c4544df52ec8fd5e6533f", "8dcaf4ef132900dd378f7be526c884b17452713b", "8e4f03c29ae1fe7227140ab4b625f375f6c00d31", + "8ec1a17ec19ae8ade95b9bdc837236981e83fffb", "949dfb7d833da9576b2ccb9eb1ab5457469c53d3", "961ec451172f373f919c593737466300e42062cb", "976989fa6e447578d9ce16ec5b526f0e09d6df50", + "977f23723027d7052df9b49eb467e6c0b9af93ff", "98b02eb70066c182c705ef4d14d8b723ad7f1fab", 
"993ca31f6974f8387bb18dd7d38987d290da8781", "9dbd05af3442e6f42d67868054751b76973f4171", + "a1cc781c694a48e018f4de110b58f561aa212051", "a2298c137b9c8c8975bad62eea9224edb95e6952", "a2671738755ab8b24775e95375dc72f1ca4e5fd6", "a26f299fb97c646effeebd4c5e2968786bd0f781", @@ -106,6 +116,7 @@ BAD_PDF_SHA1HEX: List[str] = [ "a69665d0b5d3b95f54f68406eee3ed50c67efb45", "a8357c31837404f9ebd798999d546c9398ab3648", "a9162b9aef5e5da0897275fede1a6cff8cc93dfc", + "abc9d264df446707b40d7c9f79befd0f89291e59", "ad038725bf6855a79f3c768ebe93c7103d14522f", "aef581bf42e76e527f5aed3b8958fd4e7a24819f", "b2b66b9c7f817a20144456f99c0be805602e8597", @@ -116,9 +127,11 @@ BAD_PDF_SHA1HEX: List[str] = [ "b8b427e5b3d650ba9e03197f9c3917e25b878930", "bad48b89b639b5b7df2c6a2d5288181fcb8b0e35", "be0cda7642e9247b3ee41cd2017fa709aab4f344", + "beff1b0c24aa99989be73c66dfb1d1e7578e370b", "c1b583fbd052572f08158d39ffe4d7510dadbebb", "c2526f75a013dc67b14ce1e2d0e4fc80bb93c6e1", "c4abbb284f4acaca9e8ceb88f842901984e84d33", + "c58e028269c8dfd3a442f6745c81b4c0e8610c43", "c7220d1bf1e71fb755d9f26bbdd4c539dc162960", "c7687fa6f637c7d32a25be0e772867d87536d35c", "c7d8b37ec99cf0d987e60667f05299f200e18a5d", @@ -131,7 +144,9 @@ BAD_PDF_SHA1HEX: List[str] = [ "d188762a7e3ab5d4ee8a897204316513e4e636ec", "d613b9e4442f5d5d19ea6814fa9729bff7da7c85", "d6b0f405bf13c23d0e90c54eea527442786d1cd3", + "d91d3830bf455e6dd782eee46218e35d29f07dfd", "da2211ee2dbc6dda36571976d810e2366a3d2504", + "dbb3093a797e0ae83d39eb7b235ff85a17fd965c", "e01bb7256d77aea258313bb410dfcfc10512f420", "e2bf5d0a5885359381fe8ef2cd9290171d494e9b", "e2c3b8a2cf33d5e8972bc9ddb78373766a75e412", @@ -142,6 +157,7 @@ BAD_PDF_SHA1HEX: List[str] = [ "eaf84b2efd2f69c7b3f407f89ea66ac4c41fac36", "eb1b39fd7a874896688855a22efddef10272427c", "eb5fffaa590a52bcc3705b888c6ff9c4dc4c45b2", + "ecc4b927c5e84e145c610876931bc261ae13769b", "edf8dcc8736f06afbaca0e01d60bd2c475403a3d", "ee2ee6ae2cf05128810d0d95bbe69bd263e140de", "ee9530a2c5a3d1e3813ccb51a55cc8b0d9b5dfc7", @@ -150,6 +166,7 @@ 
BAD_PDF_SHA1HEX: List[str] = [ "f0ea221d8587cede25592266486e119d277f7096", "f68f9a9202a75d2aee35252e104d796f9515001e", "f9314d3bf2eac78a7d78d18adcccdb35542054ef", + "f932ef936021a3b00842b481478c40868b9a007c", "fd9bd560662e070b222d63052830837829c490f0", ] @@ -324,7 +341,8 @@ def process_pdf( ) # this call sometimes fails and returns an AttributeError page0rect = page0.page_rect() - except (AttributeError, poppler.document.LockedDocumentError) as e: + # NOTE: poppler sometimes throws a 'ValueError', but this is pretty broad to catch + except (AttributeError, poppler.document.LockedDocumentError, ValueError) as e: # may need to expand the set of exceptions caught here over time, but # starting with a narrow set return PdfExtractResult( |