From 155a4c9918cf052bed7b73165a7a9ed965e69c6e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 25 Jun 2020 16:02:30 -0700
Subject: pdfextract_tool fixes from prod usage

---
 python/pdfextract_tool.py        | 7 +++++--
 python/sandcrawler/pdfextract.py | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index e05d48d..0d33ec9 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -61,8 +61,11 @@ def run_single(args):
     worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
     with open(args.pdf_file, 'rb') as pdf_file:
         pdf_bytes = pdf_file.read()
-    result = worker.process(pdf_bytes)
-    print(json.dumps(result.to_pdftext_dict(), sort_keys=True))
+    worker.push_record(pdf_bytes)
+    worker.finish()
+    if args.thumbnail_sink:
+        args.thumbnail_sink.finish()
+
 
 
 def main():
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index efb6cca..4606632 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -259,5 +259,5 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
         if self.thumbnail_sink and result.page0_thumbnail is not None:
             self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
 
-        return result
+        return result.to_pdftext_dict()
 
-- 
cgit v1.2.3