aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-08-05 13:06:58 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-08-05 13:06:58 -0700
commit f4c2800109fe14af19137eac9760026f0efb0c03 (patch)
tree 831fb395529382a916b6bdcc4c02be0156574f9b /python
parent ae531a3314742deb1bdd2560ffbcaa2d1f8d829b (diff)
download sandcrawler-f4c2800109fe14af19137eac9760026f0efb0c03.tar.gz
sandcrawler-f4c2800109fe14af19137eac9760026f0efb0c03.zip
more bad PDF sha1; print sha1 before poppler extract
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/pdfextract.py 7
1 files changed, 7 insertions, 0 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index ff68503..be799bb 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -27,6 +27,12 @@ BAD_PDF_SHA1HEX = [
"445968ef735b228c08c3ff4238d99fc9f4824619",
"09cba9b00494d12759c50cb914f1fb7c9746f5d1",
"447fa6b5a90742a86429a932f6608d8e141688c0",
+ "0641822e68c5a07538b967489fd19a1d5dc371a5",
+ "09db7c9f2efb496c974427a61e84292ae27fc702",
+ "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
+ "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
+ "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
+ "182749ad1db1d5e999d07f010bdcfc2978dadc88",
]
@dataclass
@@ -172,6 +178,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
file_meta=file_meta,
)
+ print(f"\tpoppler processing: {sha1hex}", file=sys.stderr)
try:
pdf = poppler.load_from_data(blob)
if pdf is None: