about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rwxr-xr-x python/pdfextract_tool.py 9
-rwxr-xr-x python/persist_tool.py 3
-rw-r--r-- python/sandcrawler/__init__.py 2
-rw-r--r-- python/tests/test_pdfextract.py 2
4 files changed, 9 insertions, 7 deletions
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index ed8c2be..080bdbc 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -63,10 +63,11 @@ def run_extract_zipfile(args):
pusher.run()
def run_single(args):
- worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=None)
+ worker = PdfExtractBlobWorker(sink=None, thumbnail_sink=None)
with open(args.pdf_file, 'rb') as pdf_file:
- result = worker.process(pdf_file.open())
- print(json.dumps(result, sort_keys=True))
+ pdf_bytes = pdf_file.read()
+ result = worker.process(pdf_bytes)
+ print(json.dumps(result.to_pdftext_dict(), sort_keys=True))
def main():
@@ -130,6 +131,8 @@ def main():
produce_topic=thumbnail_topic)
print("Running in kafka output mode, publishing to {} and {}\n".format(
text_topic, thumbnail_topic), file=sys.stderr)
+ else:
+ args.sink = None
args.func(args)
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 4d78314..66e02aa 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -10,7 +10,6 @@ Kafka feeds, but sometimes we have bulk processing output we want to backfill.
import os
import sys
import argparse
-import datetime
from sandcrawler import *
from sandcrawler.persist import *
@@ -158,7 +157,7 @@ def main():
help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
sub_pdftext = subparsers.add_parser('pdftext',
- help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (minio)")
+ help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)")
sub_pdftext.set_defaults(func=run_pdftext)
sub_pdftext.add_argument('json_file',
help="pdftext file to import from (or '-' for stdin)",
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index a01d1f8..71c2023 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -7,4 +7,4 @@ from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePage
from .ingest import IngestFileWorker
from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-from .pdf import PdfExtractWorker, PdfExtractBlobWorker
+from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 1ccf85c..2819d9f 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -4,7 +4,7 @@ import struct
import responses
from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
-from sandcrawler.pdf import process_pdf
+from sandcrawler.pdfextract import process_pdf
from test_wayback import wayback_client, cdx_client