diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-16 17:28:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-16 17:28:36 -0700 |
commit | 5c32007e23a4f3b6902b760b5e06e4dd341918b3 (patch) | |
tree | 86fe446ef6f980d09fa95867ddb0bae847cc2765 /python/sandcrawler/__init__.py | |
parent | d49ea4fb3f567351c63816e703348d8a9fd49ff0 (diff) | |
download | sandcrawler-5c32007e23a4f3b6902b760b5e06e4dd341918b3.tar.gz sandcrawler-5c32007e23a4f3b6902b760b5e06e4dd341918b3.zip |
initial work on PDF extraction worker
This worker fetches full PDFs, then extracts thumbnails, raw text, and
PDF metadata. Similar to GROBID worker.
Diffstat (limited to 'python/sandcrawler/__init__.py')
-rw-r--r-- | python/sandcrawler/__init__.py | 2 |
1 file changed, 1 insertion, 1 deletion
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 654df35..2e5efd7 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -7,4 +7,4 @@ from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePage
 from .ingest import IngestFileWorker
 from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker
 from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-
+from .pdf import PdfExtractWorker