about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-x python/pdfextract_tool.py 7
-rw-r--r-- python/sandcrawler/pdfextract.py 2
2 files changed, 6 insertions, 3 deletions
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index e05d48d..0d33ec9 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -61,8 +61,11 @@ def run_single(args):
worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
with open(args.pdf_file, 'rb') as pdf_file:
pdf_bytes = pdf_file.read()
- result = worker.process(pdf_bytes)
- print(json.dumps(result.to_pdftext_dict(), sort_keys=True))
+ worker.push_record(pdf_bytes)
+ worker.finish()
+ if args.thumbnail_sink:
+ args.thumbnail_sink.finish()
+
def main():
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index efb6cca..4606632 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -259,5 +259,5 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
if self.thumbnail_sink and result.page0_thumbnail is not None:
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
- return result
+ return result.to_pdftext_dict()