diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-08-05 13:06:58 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-08-05 13:06:58 -0700 |
commit | f4c2800109fe14af19137eac9760026f0efb0c03 (patch) | |
tree | 831fb395529382a916b6bdcc4c02be0156574f9b | |
parent | ae531a3314742deb1bdd2560ffbcaa2d1f8d829b (diff) | |
download | sandcrawler-f4c2800109fe14af19137eac9760026f0efb0c03.tar.gz sandcrawler-f4c2800109fe14af19137eac9760026f0efb0c03.zip |
more bad PDF sha1; print sha1 before poppler extract
-rw-r--r-- | python/sandcrawler/pdfextract.py | 7 |
1 files changed, 7 insertions, 0 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index ff68503..be799bb 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -27,6 +27,12 @@ BAD_PDF_SHA1HEX = [
     "445968ef735b228c08c3ff4238d99fc9f4824619",
     "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
     "447fa6b5a90742a86429a932f6608d8e141688c0",
+    "0641822e68c5a07538b967489fd19a1d5dc371a5",
+    "09db7c9f2efb496c974427a61e84292ae27fc702",
+    "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
+    "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
+    "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
+    "182749ad1db1d5e999d07f010bdcfc2978dadc88",
 ]
 
 @dataclass
@@ -172,6 +178,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
             file_meta=file_meta,
         )
 
+    print(f"\tpoppler processing: {sha1hex}", file=sys.stderr)
     try:
         pdf = poppler.load_from_data(blob)
         if pdf is None: