author    Bryan Newbold <bnewbold@archive.org>  2020-06-17 21:23:08 -0700
committer Bryan Newbold <bnewbold@archive.org>  2020-06-17 21:23:08 -0700
commit    b4a40d99b23a83eabeed490c0dce52dba31dc7b8 (patch)
tree      325f3fb3b4cf389f2e82d458346be0100f06f8bc /python/sandcrawler/persist.py
parent    83fc37ecff0176032542b2eee24b4f09d5c21db2 (diff)
fixes and tweaks from testing locally
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r--  python/sandcrawler/persist.py  29
1 file changed, 18 insertions(+), 11 deletions(-)
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 8d421ad..6d9298e 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,12 +20,14 @@ grobid
 """
 
 import os
+from typing import Optional
 import xml.etree.ElementTree
 
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgresClient
 from sandcrawler.minio import SandcrawlerMinioClient
 from sandcrawler.grobid import GrobidClient
+from sandcrawler.pdfextract import PdfExtractResult
 
 
 class PersistCdxWorker(SandcrawlerWorker):
@@ -404,29 +406,33 @@ class PersistPdfTextWorker(SandcrawlerWorker):
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
 
+        parsed_batch = []
         for r in batch:
-            if r['status'] != 'success' or not r.get('text'):
+            parsed_batch.append(PdfExtractResult.from_pdftext_dict(r))
+
+        for r in parsed_batch:
+            if r.status != 'success' or not r.text:
                 self.counts['s3-skip-status'] += 1
-                if r.get('error_msg'):
-                    r['metadata'] = {'error_msg': r['error_msg'][:500]}
+                if r.error_msg:
+                    r.metadata = {'error_msg': r.error_msg[:500]}
                 continue
 
-            assert len(r['sha1hex']) == 40
+            assert len(r.sha1hex) == 40
             if not self.db_only:
                 resp = self.s3.put_blob(
                     folder="text",
-                    blob=r['text'],
-                    sha1hex=r['sha1hex'],
+                    blob=r.text,
+                    sha1hex=r.sha1hex,
                     extension=".txt",
                 )
                 self.counts['s3-put'] += 1
 
         if not self.s3_only:
-            resp = self.db.insert_pdf_meta(self.cur, batch, on_conflict="update")
+            resp = self.db.insert_pdf_meta(self.cur, parsed_batch, on_conflict="update")
             self.counts['insert-pdf-meta'] += resp[0]
             self.counts['update-pdf-meta'] += resp[1]
 
-            file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')]
+            file_meta_batch = [r.file_meta for r in parsed_batch if r.file_meta]
             resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
             self.counts['insert-file-meta'] += resp[0]
             self.counts['update-file-meta'] += resp[1]
@@ -454,12 +460,14 @@ class PersistThumbnailWorker(SandcrawlerWorker):
         self.s3_extension = kwargs.get('s3_extension', ".jpg")
         self.s3_folder = kwargs.get('s3_folder', "pdf")
 
-    def process(self, blob, key=None):
+    def process(self, blob: bytes, key: Optional[str] = None):
         """
         Processing raw messages, not decoded JSON objects
         """
 
-        assert key is not None and len(key) == 40
+        if isinstance(key, bytes):
+            key = key.decode('utf-8')
+        assert key is not None and len(key) == 40 and isinstance(key, str)
         assert isinstance(blob, bytes)
         assert len(blob) >= 50
@@ -470,5 +478,4 @@ class PersistThumbnailWorker(SandcrawlerWorker):
             extension=self.s3_extension,
         )
         self.counts['s3-put'] += 1
-        return True
 
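
The main change in push_batch is that raw pdftext dicts are now parsed into PdfExtractResult objects before anything is persisted, so the worker uses attribute access (r.status, r.text) instead of dict lookups, and insert_pdf_meta receives the parsed batch. The real class lives in sandcrawler/pdfextract.py and is not part of this diff; the minimal sketch below only illustrates the from_pdftext_dict conversion pattern the worker now assumes, with field names inferred from the attributes referenced in the hunk (status, text, error_msg, metadata, sha1hex, file_meta).

# Illustrative sketch only -- not the actual sandcrawler.pdfextract code.
# Field names are inferred from the attributes accessed in the diff above
# and may not match the real implementation.
from dataclasses import dataclass
from typing import Any, Dict, Optional

@dataclass
class PdfExtractResult:
    sha1hex: str
    status: str
    text: Optional[str] = None
    error_msg: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    file_meta: Optional[Dict[str, Any]] = None

    @classmethod
    def from_pdftext_dict(cls, record: Dict[str, Any]) -> "PdfExtractResult":
        # Convert a raw pdftext JSON record into a typed result object, so
        # downstream persist code can rely on attributes instead of dict keys.
        return cls(
            sha1hex=record['sha1hex'],
            status=record['status'],
            text=record.get('text'),
            error_msg=record.get('error_msg'),
            metadata=record.get('metadata'),
            file_meta=record.get('file_meta'),
        )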