about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/__init__.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-12 19:40:55 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-12 19:42:43 -0800
commit 94912e739c51d2fa4d5f9de878d0b0f0544a4459 (patch)
tree af7803bee388beba7dd6dce2113e3632284537ac /python/sandcrawler/__init__.py
parent 6b3ce3169847a16fe6c0ab00f3a8af8b8ad099ab (diff)
download sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.tar.gz
sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.zip
pdftrio basic python code
This is basically just a copy/paste of GROBID code, only simpler!
Diffstat (limited to 'python/sandcrawler/__init__.py')
-rw-r--r-- python/sandcrawler/__init__.py 3
1 files changed, 2 insertions, 1 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 2d28829..b52d039 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,9 +1,10 @@
from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
+from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
from .ingest import IngestFileWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker
+from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker
from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient