diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:40:55 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:42:43 -0800 |
commit | 94912e739c51d2fa4d5f9de878d0b0f0544a4459 (patch) | |
tree | af7803bee388beba7dd6dce2113e3632284537ac /python/sandcrawler/__init__.py | |
parent | 6b3ce3169847a16fe6c0ab00f3a8af8b8ad099ab (diff) | |
download | sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.tar.gz sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.zip |
pdftrio basic python code
This is basically just a copy/paste of GROBID code, only simpler!
Diffstat (limited to 'python/sandcrawler/__init__.py')
-rw-r--r-- | python/sandcrawler/__init__.py | 3 |
1 file changed, 2 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 2d28829..b52d039 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,9 +1,10 @@
 from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
+from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
 from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
 from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
 from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
 from .ingest import IngestFileWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker
+from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker
 from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
```