about summary refs log tree commit diff stats
path: root/python/pdftrio_tool.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-18 19:01:58 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-18 19:01:58 -0800
commit 4cf5345040b4e8a5d77ca3ceb0f7ea4f8c5778dc (patch)
tree d4068ee5f77baa403286f11d04eeb23054fcf920 /python/pdftrio_tool.py
parent 505a7253abb41d55ec0004e26cfdef033c9c8c74 (diff)
download sandcrawler-4cf5345040b4e8a5d77ca3ceb0f7ea4f8c5778dc.tar.gz
sandcrawler-4cf5345040b4e8a5d77ca3ceb0f7ea4f8c5778dc.zip
pdftrio: mode controlled by CLI arg
Diffstat (limited to 'python/pdftrio_tool.py')
-rwxr-xr-x python/pdftrio_tool.py 15
1 files changed, 9 insertions, 6 deletions
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 843c214..ec92afe 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -6,7 +6,7 @@ text extraction.
Example of large parallel run, locally:
- cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
"""
import sys
@@ -21,11 +21,11 @@ def run_classify_pdf_json(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
wayback_client = WaybackClient()
if args.jobs > 1:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None)
+ worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
else:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink)
+ worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
@@ -33,7 +33,7 @@ def run_classify_pdf_cdx(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
wayback_client = WaybackClient()
if args.jobs > 1:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None)
+ worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = CdxLinePusher(
multi_worker,
@@ -43,7 +43,7 @@ def run_classify_pdf_cdx(args):
batch_size=args.jobs,
)
else:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink)
+ worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
pusher = CdxLinePusher(
worker,
args.cdx_file,
@@ -54,7 +54,7 @@ def run_classify_pdf_cdx(args):
def run_classify_pdf_zipfile(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
- worker = PdfTrioBlobWorker(pdftrio_client, sink=args.sink)
+ worker = PdfTrioBlobWorker(pdftrio_client, sink=args.sink, mode=args.pdftrio_mode)
pusher = ZipfilePusher(worker, args.zip_file)
pusher.run()
@@ -77,6 +77,9 @@ def main():
parser.add_argument('--pdftrio-host',
default="http://pdftrio.qa.fatcat.wiki",
help="pdftrio API host/port")
+ parser.add_argument('--pdftrio-mode',
+ default="auto",
+ help="which classification mode to use")
subparsers = parser.add_subparsers()
sub_classify_pdf_json = subparsers.add_parser('classify-pdf-json',