diff options
Diffstat (limited to 'python')
-rwxr-xr-x | python/pdfextract_tool.py | 9 | ++++++---
-rwxr-xr-x | python/persist_tool.py | 3 | +--
-rw-r--r-- | python/sandcrawler/__init__.py | 2 | +-
-rw-r--r-- | python/tests/test_pdfextract.py | 2 | +-
4 files changed, 9 insertions, 7 deletions
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py index ed8c2be..080bdbc 100755 --- a/python/pdfextract_tool.py +++ b/python/pdfextract_tool.py @@ -63,10 +63,11 @@ def run_extract_zipfile(args): pusher.run() def run_single(args): - worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=None) + worker = PdfExtractBlobWorker(sink=None, thumbnail_sink=None) with open(args.pdf_file, 'rb') as pdf_file: - result = worker.process(pdf_file.open()) - print(json.dumps(result, sort_keys=True)) + pdf_bytes = pdf_file.read() + result = worker.process(pdf_bytes) + print(json.dumps(result.to_pdftext_dict(), sort_keys=True)) def main(): @@ -130,6 +131,8 @@ def main(): produce_topic=thumbnail_topic) print("Running in kafka output mode, publishing to {} and {}\n".format( text_topic, thumbnail_topic), file=sys.stderr) + else: + args.sink = None args.func(args) diff --git a/python/persist_tool.py b/python/persist_tool.py index 4d78314..66e02aa 100755 --- a/python/persist_tool.py +++ b/python/persist_tool.py @@ -10,7 +10,6 @@ Kafka feeds, but sometimes we have bulk processing output we want to backfill. 
import os import sys import argparse -import datetime from sandcrawler import * from sandcrawler.persist import * @@ -158,7 +157,7 @@ def main(): help="only write status to sandcrawler-db (don't save TEI-XML to S3)") sub_pdftext = subparsers.add_parser('pdftext', - help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (minio)") + help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)") sub_pdftext.set_defaults(func=run_pdftext) sub_pdftext.add_argument('json_file', help="pdftext file to import from (or '-' for stdin)", diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index a01d1f8..71c2023 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -7,4 +7,4 @@ from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePage from .ingest import IngestFileWorker from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient -from .pdf import PdfExtractWorker, PdfExtractBlobWorker +from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py index 1ccf85c..2819d9f 100644 --- a/python/tests/test_pdfextract.py +++ b/python/tests/test_pdfextract.py @@ -4,7 +4,7 @@ import struct import responses from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient -from sandcrawler.pdf import process_pdf +from sandcrawler.pdfextract import process_pdf from test_wayback import wayback_client, cdx_client |