aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/__init__.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2020-06-16 17:28:33 -0700
committer: Bryan Newbold <bnewbold@archive.org>, 2020-06-16 17:28:36 -0700
commit: 5c32007e23a4f3b6902b760b5e06e4dd341918b3 (patch)
tree: 86fe446ef6f980d09fa95867ddb0bae847cc2765 /python/sandcrawler/__init__.py
parent: d49ea4fb3f567351c63816e703348d8a9fd49ff0 (diff)
download: sandcrawler-5c32007e23a4f3b6902b760b5e06e4dd341918b3.tar.gz
download: sandcrawler-5c32007e23a4f3b6902b760b5e06e4dd341918b3.zip
initial work on PDF extraction worker
This worker fetches full PDFs, then extracts thumbnails, raw text, and PDF metadata. Similar to GROBID worker.
Diffstat (limited to 'python/sandcrawler/__init__.py')
-rw-r--r-- python/sandcrawler/__init__.py | 2
1 files changed, 1 insertions, 1 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 654df35..2e5efd7 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -7,4 +7,4 @@ from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePage
from .ingest import IngestFileWorker
from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker
from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-
+from .pdf import PdfExtractWorker