pdfextract: handle too-large fulltext

author: Bryan Newbold <bnewbold@archive.org> 2020-06-25 22:02:04 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-25 22:02:04 -0700
commit: 96513b5cc26dd91375a6433e49d24b2d9eb2bea9 (patch)
tree: a57b5ee38f3e8a6128829a1e649349283d499548
parent: bfb8aeb387e1f3583b7ef295124b2637b2c368e0 (diff)
download: sandcrawler-96513b5cc26dd91375a6433e49d24b2d9eb2bea9.tar.gz
sandcrawler-96513b5cc26dd91375a6433e49d24b2d9eb2bea9.zip
1 file changed, 17 insertions, 0 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index a6c25c1..d1b060e 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -198,6 +198,23 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
     for n in range(1, pdf.pages):
         pageN = pdf.create_page(n)
         full_text += pageN.text()
+
+    # Kafka message size limit; cap at about 1 MByte
+    if len(full_text)> 1000000:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='text-too-large',
+            error_msg="full_text chars: {}".format(len(full_text)),
+            file_meta=file_meta,
+        )
+    if len(pdf.metadata)> 1000000:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='text-too-large',
+            error_msg="meta_xml chars: {}".format(len(pdf.metadata)),  # report the metadata length that tripped this check, not the full_text length
+            file_meta=file_meta,
+        )
+
     pdf_info = pdf.infos()
     # TODO: is this actually needed? or does json marshalling work automatically?
     for k in pdf_info.keys():
author	Bryan Newbold <bnewbold@archive.org>	2020-06-25 22:02:04 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-06-25 22:02:04 -0700
commit	96513b5cc26dd91375a6433e49d24b2d9eb2bea9 (patch)
tree	a57b5ee38f3e8a6128829a1e649349283d499548
parent	bfb8aeb387e1f3583b7ef295124b2637b2c368e0 (diff)
download	sandcrawler-96513b5cc26dd91375a6433e49d24b2d9eb2bea9.tar.gz sandcrawler-96513b5cc26dd91375a6433e49d24b2d9eb2bea9.zip