diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 16:02:30 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 16:02:30 -0700 |
commit | 155a4c9918cf052bed7b73165a7a9ed965e69c6e (patch) | |
tree | cb1c3079aa6a54c6d88218a81e8c23887f23678a /python/pdfextract_tool.py | |
parent | 445532ac28519508713306e7ad4ab8524333a367 (diff) | |
download | sandcrawler-155a4c9918cf052bed7b73165a7a9ed965e69c6e.tar.gz sandcrawler-155a4c9918cf052bed7b73165a7a9ed965e69c6e.zip |
pdfextract_tool fixes from prod usage
Diffstat (limited to 'python/pdfextract_tool.py')
-rwxr-xr-x | python/pdfextract_tool.py | 7 |
1 file changed, 5 insertions, 2 deletions
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index e05d48d..0d33ec9 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -61,8 +61,11 @@ def run_single(args):
     worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
     with open(args.pdf_file, 'rb') as pdf_file:
         pdf_bytes = pdf_file.read()
-    result = worker.process(pdf_bytes)
-    print(json.dumps(result.to_pdftext_dict(), sort_keys=True))
+    worker.push_record(pdf_bytes)
+    worker.finish()
+    if args.thumbnail_sink:
+        args.thumbnail_sink.finish()
+
 
 def main():