From 5d10b14f53d81626e489579f1e53b64749609f00 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 17 Jun 2020 18:05:06 -0700 Subject: note about text layout with pdf extraction --- python/sandcrawler/pdfextract.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index cfba679..c950d18 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -45,6 +45,14 @@ class PdfExtractResult: def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult: + """ + A known issue is that output text is in "physical layout" mode, which means + columns will be side-by-side. We would prefer a single stream of tokens! + + Tried using page.text(layout_mode=poppler.TextLayout.raw_order_layout) + instead of the default mode (poppler.TextLayout.physical_layout), but that + didn't seem to work at all (returned empty strings). + """ file_meta = gen_file_metadata(blob) sha1hex = file_meta['sha1hex'] if file_meta['mimetype'] != 'application/pdf': -- cgit v1.2.3