about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/ia.py 2
-rw-r--r-- python/sandcrawler/pdf.py 35
2 files changed, 22 insertions, 15 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 24ff619..49f5ad4 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -11,7 +11,7 @@ from collections import namedtuple
import http.client
# not sure this will really work. Should go before wayback imports.
-http.client._MAXHEADERS = 1000
+http.client._MAXHEADERS = 1000 # type: ignore
import wayback.exception
from http.client import IncompleteRead
diff --git a/python/sandcrawler/pdf.py b/python/sandcrawler/pdf.py
index b55e2bb..b9baed7 100644
--- a/python/sandcrawler/pdf.py
+++ b/python/sandcrawler/pdf.py
@@ -16,15 +16,16 @@ from .ia import WaybackClient, WaybackError, PetaboxError
class PdfExtractResult:
sha1hex: str
status: str
- error_msg: Optional[str]
- file_meta: Optional[Dict[str,Any]]
- text: Optional[str]
- page0_thumbnail: Optional[bytes]
- meta_xml: Optional[str]
- pdf_info: Optional[Dict[str,Any]]
- pdf_extra: Optional[Dict[str,Any]]
-
- def to_text_dict(self) -> dict:
+ error_msg: Optional[str] = None
+ file_meta: Optional[Dict[str,Any]] = None
+ text: Optional[str] = None
+ page0_thumbnail: Optional[bytes] = None
+ meta_xml: Optional[str] = None
+ pdf_info: Optional[Dict[str,Any]] = None
+ pdf_extra: Optional[Dict[str,Any]] = None
+ source: Optional[Dict[str,Any]] = None
+
+ def to_pdftext_dict(self) -> dict:
"""
Outputs a JSON string as would be published to Kafka text/info topic.
"""
@@ -38,6 +39,7 @@ class PdfExtractResult:
'meta_xml': self.meta_xml,
'pdf_info': self.pdf_info,
'pdf_extra': self.pdf_extra,
+ 'source': self.source,
}
@@ -71,9 +73,9 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
img.thumbnail(thumb_size, Image.BICUBIC)
buf = BytesIO()
img.save(buf, thumb_type)
- page0_thumbnail = buf.bytes.getvalue()
+ page0_thumbnail = buf.getvalue()
# assuming that very small images mean something went wrong
- if len(page0_thumbnail) < 50:
+ if page0_thumbnail is None or len(page0_thumbnail) < 50:
page0_thumbnail = None
except Exception as e:
print(str(e), file=sys.stderr)
@@ -116,7 +118,7 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
self.sink = sink
self.thumbnail_sink = kwargs.get('thumbnail_sink')
- def timeout_response(self, task):
+ def timeout_response(self, task) -> Dict:
default_key = task['sha1hex']
return dict(
status="error-timeout",
@@ -137,7 +139,7 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
result.source = record
if self.thumbnail_sink and result.page0_thumbnail is not None:
self.thumbnail_sink.push_record(result.page0_thumbnail)
- return result.to_thing()
+ return result.to_pdftext_dict()
class PdfExtractBlobWorker(SandcrawlerWorker):
"""
@@ -148,10 +150,15 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
def __init__(self, sink=None, **kwargs):
super().__init__()
self.sink = sink
+ self.thumbnail_sink = kwargs.get('thumbnail_sink')
- def process(self, blob):
+ def process(self, blob, key: Optional[str] = None):
if not blob:
return None
+
result = process_pdf(blob)
+ if self.thumbnail_sink and result.page0_thumbnail is not None:
+ self.thumbnail_sink.push_record(result.page0_thumbnail)
+
return result