about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-25 23:08:47 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-25 23:08:47 -0700
commit 254f24ad6566c9d4b5814868911b604802847b58 (patch)
tree 20e9df75d4b89a29d658f57821899044cab34956 /python
parent fa25698620f54207423c3da40ae1bca567b598fa (diff)
download sandcrawler-254f24ad6566c9d4b5814868911b604802847b58.tar.gz
sandcrawler-254f24ad6566c9d4b5814868911b604802847b58.zip
simpler handling of null PDF text pages
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/pdfextract.py 15
1 files changed, 4 insertions, 11 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 921c883..ed323b9 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -194,18 +194,11 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
print(str(e), file=sys.stderr)
page0_thumbnail = None
- try:
- full_text = page0.text()
- for n in range(1, pdf.pages):
- pageN = pdf.create_page(n)
+ full_text = page0.text()
+ for n in range(1, pdf.pages):
+ pageN = pdf.create_page(n)
+ if pageN:
full_text += pageN.text()
- except AttributeError as e:
- return PdfExtractResult(
- sha1hex=sha1hex,
- status='parse-error',
- error_msg=str(e),
- file_meta=file_meta,
- )
# Kafka message size limit; cap at about 1 MByte
if len(full_text)> 1000000: