about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-25 16:02:30 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-25 16:02:30 -0700
commit 155a4c9918cf052bed7b73165a7a9ed965e69c6e (patch)
tree cb1c3079aa6a54c6d88218a81e8c23887f23678a
parent 445532ac28519508713306e7ad4ab8524333a367 (diff)
download sandcrawler-155a4c9918cf052bed7b73165a7a9ed965e69c6e.tar.gz
sandcrawler-155a4c9918cf052bed7b73165a7a9ed965e69c6e.zip
pdfextract_tool fixes from prod usage
-rwxr-xr-x python/pdfextract_tool.py 7
-rw-r--r-- python/sandcrawler/pdfextract.py 2
2 files changed, 6 insertions, 3 deletions
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index e05d48d..0d33ec9 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -61,8 +61,11 @@ def run_single(args):
worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
with open(args.pdf_file, 'rb') as pdf_file:
pdf_bytes = pdf_file.read()
- result = worker.process(pdf_bytes)
- print(json.dumps(result.to_pdftext_dict(), sort_keys=True))
+ worker.push_record(pdf_bytes)
+ worker.finish()
+ if args.thumbnail_sink:
+ args.thumbnail_sink.finish()
+
def main():
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index efb6cca..4606632 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -259,5 +259,5 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
if self.thumbnail_sink and result.page0_thumbnail is not None:
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
- return result
+ return result.to_pdftext_dict()