From d2fb570038ced65e6890e689e900a0f1aaed917c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 17 Jun 2020 11:12:59 -0700 Subject: add new pdf workers/persisters --- python/sandcrawler/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'python/sandcrawler/__init__.py') diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 2e5efd7..a01d1f8 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -5,6 +5,6 @@ from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow from .ingest import IngestFileWorker -from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker +from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient -from .pdf import PdfExtractWorker +from .pdf import PdfExtractWorker, PdfExtractBlobWorker -- cgit v1.2.3