From 94912e739c51d2fa4d5f9de878d0b0f0544a4459 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 12 Feb 2020 19:40:55 -0800
Subject: pdftrio basic python code

This is basically just a copy/paste of GROBID code, only simpler!
---
 python/sandcrawler/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'python/sandcrawler/__init__.py')

diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 2d28829..b52d039 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,9 +1,10 @@
 from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
+from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
 from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
 from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
 from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
 from .ingest import IngestFileWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker
+from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker
 from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-- 
cgit v1.2.3