about | summary | refs | log | tree | commit | diff | stats
path: root/python/pdfextract_tool.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-25 16:02:30 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-25 16:02:30 -0700
commit: 155a4c9918cf052bed7b73165a7a9ed965e69c6e (patch)
tree: cb1c3079aa6a54c6d88218a81e8c23887f23678a /python/pdfextract_tool.py
parent: 445532ac28519508713306e7ad4ab8524333a367 (diff)
download: sandcrawler-155a4c9918cf052bed7b73165a7a9ed965e69c6e.tar.gz
          sandcrawler-155a4c9918cf052bed7b73165a7a9ed965e69c6e.zip
pdfextract_tool fixes from prod usage
Diffstat (limited to 'python/pdfextract_tool.py')
-rwxr-xr-x python/pdfextract_tool.py 7
1 files changed, 5 insertions, 2 deletions
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index e05d48d..0d33ec9 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -61,8 +61,11 @@ def run_single(args):
worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
with open(args.pdf_file, 'rb') as pdf_file:
pdf_bytes = pdf_file.read()
- result = worker.process(pdf_bytes)
- print(json.dumps(result.to_pdftext_dict(), sort_keys=True))
+ worker.push_record(pdf_bytes)
+ worker.finish()
+ if args.thumbnail_sink:
+ args.thumbnail_sink.finish()
+
def main():