diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 22:02:04 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 22:02:04 -0700 |
commit | 96513b5cc26dd91375a6433e49d24b2d9eb2bea9 (patch) | |
tree | a57b5ee38f3e8a6128829a1e649349283d499548 /python | |
parent | bfb8aeb387e1f3583b7ef295124b2637b2c368e0 (diff) | |
download | sandcrawler-96513b5cc26dd91375a6433e49d24b2d9eb2bea9.tar.gz sandcrawler-96513b5cc26dd91375a6433e49d24b2d9eb2bea9.zip |
pdfextract: handle too-large fulltext
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/pdfextract.py | 17 |
1 file changed, 17 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index a6c25c1..d1b060e 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -198,6 +198,23 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
     for n in range(1, pdf.pages):
         pageN = pdf.create_page(n)
         full_text += pageN.text()
+
+    # Kafka message size limit; cap at about 1 MByte
+    if len(full_text)> 1000000:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='text-too-large',
+            error_msg="full_text chars: {}".format(len(full_text)),
+            file_meta=file_meta,
+        )
+    if len(pdf.metadata)> 1000000:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='text-too-large',
+            error_msg="meta_xml chars: {}".format(len(full_text)),
+            file_meta=file_meta,
+        )
+
     pdf_info = pdf.infos()
     # TODO: is this actually needed? or does json marshalling work automatically?
     for k in pdf_info.keys():
```