note about text layout with pdf extraction

author: Bryan Newbold <bnewbold@archive.org> 2020-06-17 18:05:06 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-17 18:05:06 -0700
commit: 5d10b14f53d81626e489579f1e53b64749609f00 (patch)
tree: 157fe50d65ab795fb77a442e1bb16ea8cc54a761 /python
parent: 55fca256e26ef53c4a9f59d074a835f87ee5b79f (diff)
download: sandcrawler-5d10b14f53d81626e489579f1e53b64749609f00.tar.gz sandcrawler-5d10b14f53d81626e489579f1e53b64749609f00.zip
1 file changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index cfba679..c950d18 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -45,6 +45,14 @@ class PdfExtractResult:
 
 
 def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult:
+    """
+    A known issue is that output text is in "physical layout" mode, which means
+    columns will be side-by-side. We would prefer a single stream of tokens!
+
+    Tried using page.text(layout_mode=poppler.TextLayout.raw_order_layout)
+    instead of the default mode (poppler.TextLayout.physical_layout), but that
+    didn't seem to work at all (returned empty strings).
+    """
     file_meta = gen_file_metadata(blob)
     sha1hex = file_meta['sha1hex']
     if file_meta['mimetype'] != 'application/pdf':
author	Bryan Newbold <bnewbold@archive.org>	2020-06-17 18:05:06 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-06-17 18:05:06 -0700
commit	5d10b14f53d81626e489579f1e53b64749609f00 (patch)
tree	157fe50d65ab795fb77a442e1bb16ea8cc54a761 /python
parent	55fca256e26ef53c4a9f59d074a835f87ee5b79f (diff)
download	sandcrawler-5d10b14f53d81626e489579f1e53b64749609f00.tar.gz sandcrawler-5d10b14f53d81626e489579f1e53b64749609f00.zip