diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 18:04:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 18:04:37 -0700 |
commit | 55fca256e26ef53c4a9f59d074a835f87ee5b79f (patch) | |
tree | c33106e854aef088b841647cb32b15dbe7ca54e8 /python/pdfextract_tool.py | |
parent | e03f4eda875f4a36a3d3e62eac467b3f2b1e0fbe (diff) | |
download | sandcrawler-55fca256e26ef53c4a9f59d074a835f87ee5b79f.tar.gz sandcrawler-55fca256e26ef53c4a9f59d074a835f87ee5b79f.zip |
lint fixes
Diffstat (limited to 'python/pdfextract_tool.py')
-rwxr-xr-x | python/pdfextract_tool.py | 9 |
1 files changed, 6 insertions, 3 deletions
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index ed8c2be..080bdbc 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -63,10 +63,11 @@ def run_extract_zipfile(args):
     pusher.run()
 
 def run_single(args):
-    worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=None)
+    worker = PdfExtractBlobWorker(sink=None, thumbnail_sink=None)
     with open(args.pdf_file, 'rb') as pdf_file:
-        result = worker.process(pdf_file.open())
-    print(json.dumps(result, sort_keys=True))
+        pdf_bytes = pdf_file.read()
+    result = worker.process(pdf_bytes)
+    print(json.dumps(result.to_pdftext_dict(), sort_keys=True))
 
 
 def main():
@@ -130,6 +131,8 @@ def main():
                 produce_topic=thumbnail_topic)
         print("Running in kafka output mode, publishing to {} and {}\n".format(
             text_topic, thumbnail_topic), file=sys.stderr)
+    else:
+        args.sink = None
 
     args.func(args)
 